1 package us.codecraft.webmagic.model;
2
3 import org.apache.commons.lang3.StringUtils;
4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory;
6
7 import lombok.Getter;
8 import us.codecraft.webmagic.Page;
9 import us.codecraft.webmagic.model.annotation.*;
10 import us.codecraft.webmagic.model.fields.PageField;
11 import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
12 import us.codecraft.webmagic.model.sources.Source;
13 import us.codecraft.webmagic.model.sources.SourceTextExtractor;
14 import us.codecraft.webmagic.model.sources.Source.*;
15 import us.codecraft.webmagic.selector.*;
16 import us.codecraft.webmagic.utils.ClassUtils;
17 import us.codecraft.webmagic.utils.ExtractorUtils;
18
19 import java.lang.annotation.Annotation;
20 import java.lang.reflect.Field;
21 import java.lang.reflect.Method;
22 import java.util.ArrayList;
23 import java.util.List;
24 import java.util.regex.Pattern;
25
26 import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;
27
28
29
30
31
32
33
34 class PageModelExtractor {
35
36 @Getter
37 private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
38
39 @Getter
40 private Selector targetUrlRegionSelector;
41
42 @Getter
43 private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
44
45 @Getter
46 private Selector helpUrlRegionSelector;
47
48 @Getter
49 private Class clazz;
50
51 private List<FieldExtractor> fieldExtractors;
52
53 private Extractor objectExtractor;
54
55 private Logger logger = LoggerFactory.getLogger(getClass());
56
57 public static PageModelExtractor create(Class clazz) {
58 PageModelExtractor pageModelExtractor = new PageModelExtractor();
59 pageModelExtractor.init(clazz);
60 return pageModelExtractor;
61 }
62
63 private void init(Class clazz) {
64 this.clazz = clazz;
65 initClassExtractors();
66 fieldExtractors = new ArrayList<FieldExtractor>();
67 for (Field field : ClassUtils.getFieldsIncludeSuperClass(clazz)) {
68 field.setAccessible(true);
69 FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
70 FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field);
71 if (fieldExtractor != null && fieldExtractorTmp != null) {
72 throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
73 } else if (fieldExtractor == null && fieldExtractorTmp != null) {
74 fieldExtractor = fieldExtractorTmp;
75 }
76 fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
77 if (fieldExtractor != null && fieldExtractorTmp != null) {
78 throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
79 } else if (fieldExtractor == null && fieldExtractorTmp != null) {
80 fieldExtractor = fieldExtractorTmp;
81 }
82 if (fieldExtractor != null) {
83 fieldExtractor.setObjectFormatter(new ObjectFormatterBuilder().setField(field).build());
84 fieldExtractors.add(fieldExtractor);
85 }
86 }
87 }
88
89 private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
90 FieldExtractor fieldExtractor = null;
91 ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
92 if (extractByUrl != null) {
93 String regexPattern = extractByUrl.value();
94 if (regexPattern.trim().equals("")) {
95 regexPattern = ".*";
96 }
97 fieldExtractor = new FieldExtractor(field,
98 new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
99 extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
100 Method setterMethod = getSetterMethod(clazz, field);
101 if (setterMethod != null) {
102 fieldExtractor.setSetterMethod(setterMethod);
103 }
104 }
105 return fieldExtractor;
106 }
107
108 private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
109 FieldExtractor fieldExtractor = null;
110 ComboExtract comboExtract = field.getAnnotation(ComboExtract.class);
111 if (comboExtract != null) {
112 ExtractBy[] extractBies = comboExtract.value();
113 Selector selector;
114 switch (comboExtract.op()) {
115 case And:
116 selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
117 break;
118 case Or:
119 selector = new OrSelector(ExtractorUtils.getSelectors(extractBies));
120 break;
121 default:
122 selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
123 }
124 fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
125 comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
126 Method setterMethod = getSetterMethod(clazz, field);
127 if (setterMethod != null) {
128 fieldExtractor.setSetterMethod(setterMethod);
129 }
130 }
131 return fieldExtractor;
132 }
133
134 private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
135 FieldExtractor fieldExtractor = null;
136 ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
137 if (extractBy != null) {
138 Selector selector = ExtractorUtils.getSelector(extractBy);
139 ExtractBy.Source extractSource = extractBy.source();
140 if (extractBy.type()== ExtractBy.Type.JsonPath)
141 extractSource = RawText;
142 Source source = null;
143 switch (extractSource) {
144 case RawText:
145 source = new RawText();
146 break;
147 case RawHtml:
148 source = new RawHtml();
149 break;
150 case SelectedHtml:
151 source = new SelectedHtml();
152 break;
153 default:
154 source = new SelectedHtml();
155 }
156 fieldExtractor = new FieldExtractor(field, selector, source,
157 extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
158 fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
159 }
160 return fieldExtractor;
161 }
162
163 public static Method getSetterMethod(Class clazz, Field field) {
164 String name = "set" + StringUtils.capitalize(field.getName());
165 try {
166 Method declaredMethod = clazz.getDeclaredMethod(name, field.getType());
167 declaredMethod.setAccessible(true);
168 return declaredMethod;
169 } catch (NoSuchMethodException e) {
170 return null;
171 }
172 }
173
174 private void initClassExtractors() {
175 Annotation annotation = clazz.getAnnotation(TargetUrl.class);
176 if (annotation == null) {
177 targetUrlPatterns.add(Pattern.compile(".*"));
178 } else {
179 TargetUrl targetUrl = (TargetUrl) annotation;
180 String[] value = targetUrl.value();
181 for (String s : value) {
182 targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
183 }
184 if (!targetUrl.sourceRegion().equals("")) {
185 targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
186 }
187 }
188 annotation = clazz.getAnnotation(HelpUrl.class);
189 if (annotation != null) {
190 HelpUrl helpUrl = (HelpUrl) annotation;
191 String[] value = helpUrl.value();
192 for (String s : value) {
193 helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
194 }
195 if (!helpUrl.sourceRegion().equals("")) {
196 helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
197 }
198 }
199 annotation = clazz.getAnnotation(ExtractBy.class);
200 if (annotation != null) {
201 ExtractBy extractBy = (ExtractBy) annotation;
202 objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
203 }
204 }
205
206 public Object process(Page page) {
207 boolean matched = false;
208 for (Pattern targetPattern : targetUrlPatterns) {
209 if (targetPattern.matcher(page.getUrl().toString()).matches()) {
210 matched = true;
211 }
212 }
213 if (!matched) {
214 return null;
215 }
216 if (objectExtractor == null) {
217 return processSingle(page, null, true);
218 } else {
219 if (objectExtractor.multi) {
220 List<Object> os = new ArrayList<Object>();
221 List<String> list = objectExtractor.getSelector().selectList(page.getRawText());
222 for (String s : list) {
223 Object o = processSingle(page, s, false);
224 if (o != null) {
225 os.add(o);
226 }
227 }
228 return os;
229 } else {
230 String select = objectExtractor.getSelector().select(page.getRawText());
231 Object o = processSingle(page, select, false);
232 return o;
233 }
234 }
235 }
236
237 private Object processSingle(Page page, String html, boolean isRaw) {
238 Object o = null;
239 try {
240 o = clazz.newInstance();
241 for (FieldExtractor fieldExtractor : fieldExtractors) {
242 PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
243 if (!field.operation(o, fieldExtractor, logger))
244 return null;
245 }
246 if (AfterExtractor.class.isAssignableFrom(clazz))
247 ((AfterExtractor) o).afterProcess(page);
248 } catch (Exception e) {
249 logger.error("extract fail", e);
250 }
251 return o;
252 }
253 }