View Javadoc
1   package us.codecraft.webmagic.model;
2   
3   import org.apache.commons.lang3.StringUtils;
4   import org.slf4j.Logger;
5   import org.slf4j.LoggerFactory;
6   
7   import lombok.Getter;
8   import us.codecraft.webmagic.Page;
9   import us.codecraft.webmagic.model.annotation.*;
10  import us.codecraft.webmagic.model.fields.PageField;
11  import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
12  import us.codecraft.webmagic.model.sources.Source;
13  import us.codecraft.webmagic.model.sources.SourceTextExtractor;
14  import us.codecraft.webmagic.model.sources.Source.*;
15  import us.codecraft.webmagic.selector.*;
16  import us.codecraft.webmagic.utils.ClassUtils;
17  import us.codecraft.webmagic.utils.ExtractorUtils;
18  
19  import java.lang.annotation.Annotation;
20  import java.lang.reflect.Field;
21  import java.lang.reflect.Method;
22  import java.util.ArrayList;
23  import java.util.List;
24  import java.util.regex.Pattern;
25  
26  import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;
27  
28  /**
29   * The main internal logic of page model extractor.
30   *
31   * @author code4crafter@gmail.com <br>
32   * @since 0.2.0
33   */
34  class PageModelExtractor {
35  
36      @Getter
37      private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
38  
39      @Getter
40      private Selector targetUrlRegionSelector;
41  
42      @Getter
43      private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
44  
45      @Getter
46      private Selector helpUrlRegionSelector;
47  
48      @Getter
49      private Class clazz;
50  
51      private List<FieldExtractor> fieldExtractors;
52  
53      private Extractor objectExtractor;
54  
55      private Logger logger = LoggerFactory.getLogger(getClass());
56  
57      public static PageModelExtractor create(Class clazz) {
58          PageModelExtractor pageModelExtractor = new PageModelExtractor();
59          pageModelExtractor.init(clazz);
60          return pageModelExtractor;
61      }
62  
63      private void init(Class clazz) {
64          this.clazz = clazz;
65          initClassExtractors();
66          fieldExtractors = new ArrayList<FieldExtractor>();
67          for (Field field : ClassUtils.getFieldsIncludeSuperClass(clazz)) {
68              field.setAccessible(true);
69              FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
70              FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field);
71              if (fieldExtractor != null && fieldExtractorTmp != null) {
72                  throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
73              } else if (fieldExtractor == null && fieldExtractorTmp != null) {
74                  fieldExtractor = fieldExtractorTmp;
75              }
76              fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
77              if (fieldExtractor != null && fieldExtractorTmp != null) {
78                  throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
79              } else if (fieldExtractor == null && fieldExtractorTmp != null) {
80                  fieldExtractor = fieldExtractorTmp;
81              }
82              if (fieldExtractor != null) {
83                  fieldExtractor.setObjectFormatter(new ObjectFormatterBuilder().setField(field).build());
84                  fieldExtractors.add(fieldExtractor);
85              }
86          }
87      }
88  
89      private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
90          FieldExtractor fieldExtractor = null;
91          ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
92          if (extractByUrl != null) {
93              String regexPattern = extractByUrl.value();
94              if (regexPattern.trim().equals("")) {
95                  regexPattern = ".*";
96              }
97              fieldExtractor = new FieldExtractor(field,
98                      new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
99                      extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
100             Method setterMethod = getSetterMethod(clazz, field);
101             if (setterMethod != null) {
102                 fieldExtractor.setSetterMethod(setterMethod);
103             }
104         }
105         return fieldExtractor;
106     }
107 
108     private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
109         FieldExtractor fieldExtractor = null;
110         ComboExtract comboExtract = field.getAnnotation(ComboExtract.class);
111         if (comboExtract != null) {
112             ExtractBy[] extractBies = comboExtract.value();
113             Selector selector;
114             switch (comboExtract.op()) {
115                 case And:
116                     selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
117                     break;
118                 case Or:
119                     selector = new OrSelector(ExtractorUtils.getSelectors(extractBies));
120                     break;
121                 default:
122                     selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
123             }
124             fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
125                     comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
126             Method setterMethod = getSetterMethod(clazz, field);
127             if (setterMethod != null) {
128                 fieldExtractor.setSetterMethod(setterMethod);
129             }
130         }
131         return fieldExtractor;
132     }
133 
134     private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
135         FieldExtractor fieldExtractor = null;
136         ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
137         if (extractBy != null) {
138             Selector selector = ExtractorUtils.getSelector(extractBy);
139             ExtractBy.Source extractSource = extractBy.source();
140             if (extractBy.type()== ExtractBy.Type.JsonPath)
141                 extractSource = RawText;
142             Source source = null;
143             switch (extractSource) {
144                 case RawText:
145                     source = new RawText();
146                     break;
147                 case RawHtml:
148                     source = new RawHtml();
149                     break;
150                 case SelectedHtml:
151                     source = new SelectedHtml();
152                     break;
153                 default:
154                     source = new SelectedHtml();
155             }
156             fieldExtractor = new FieldExtractor(field, selector, source,
157                     extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
158             fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
159         }
160         return fieldExtractor;
161     }
162 
163     public static Method getSetterMethod(Class clazz, Field field) {
164         String name = "set" + StringUtils.capitalize(field.getName());
165         try {
166             Method declaredMethod = clazz.getDeclaredMethod(name, field.getType());
167             declaredMethod.setAccessible(true);
168             return declaredMethod;
169         } catch (NoSuchMethodException e) {
170             return null;
171         }
172     }
173 
174     private void initClassExtractors() {
175         Annotation annotation = clazz.getAnnotation(TargetUrl.class);
176         if (annotation == null) {
177             targetUrlPatterns.add(Pattern.compile(".*"));
178         } else {
179             TargetUrl targetUrl = (TargetUrl) annotation;
180             String[] value = targetUrl.value();
181             for (String s : value) {
182                 targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
183             }
184             if (!targetUrl.sourceRegion().equals("")) {
185                 targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
186             }
187         }
188         annotation = clazz.getAnnotation(HelpUrl.class);
189         if (annotation != null) {
190             HelpUrl helpUrl = (HelpUrl) annotation;
191             String[] value = helpUrl.value();
192             for (String s : value) {
193                 helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
194             }
195             if (!helpUrl.sourceRegion().equals("")) {
196                 helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
197             }
198         }
199         annotation = clazz.getAnnotation(ExtractBy.class);
200         if (annotation != null) {
201             ExtractBy extractBy = (ExtractBy) annotation;
202             objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
203         }
204     }
205 
206     public Object process(Page page) {
207         boolean matched = false;
208         for (Pattern targetPattern : targetUrlPatterns) {
209             if (targetPattern.matcher(page.getUrl().toString()).matches()) {
210                 matched = true;
211             }
212         }
213         if (!matched) {
214             return null;
215         }
216         if (objectExtractor == null) {
217             return processSingle(page, null, true);
218         } else {
219             if (objectExtractor.multi) {
220                 List<Object> os = new ArrayList<Object>();
221                 List<String> list = objectExtractor.getSelector().selectList(page.getRawText());
222                 for (String s : list) {
223                     Object o = processSingle(page, s, false);
224                     if (o != null) {
225                         os.add(o);
226                     }
227                 }
228                 return os;
229             } else {
230                 String select = objectExtractor.getSelector().select(page.getRawText());
231                 Object o = processSingle(page, select, false);
232                 return o;
233             }
234         }
235     }
236 
237     private Object processSingle(Page page, String html, boolean isRaw) {
238         Object o = null;
239         try {
240             o = clazz.newInstance();
241             for (FieldExtractor fieldExtractor : fieldExtractors) {
242                 PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
243                 if (!field.operation(o, fieldExtractor, logger))
244                     return null;
245             }
246             if (AfterExtractor.class.isAssignableFrom(clazz))
247                 ((AfterExtractor) o).afterProcess(page);
248         } catch (Exception e) {
249             logger.error("extract fail", e);
250         }
251         return o;
252     }
253 }