View Javadoc
1   package us.codecraft.webmagic.model;
2   
3   import us.codecraft.webmagic.Page;
4   import us.codecraft.webmagic.Request;
5   import us.codecraft.webmagic.Site;
6   import us.codecraft.webmagic.processor.PageProcessor;
7   import us.codecraft.webmagic.selector.Selector;
8   
9   import java.util.ArrayList;
10  import java.util.List;
11  import java.util.regex.Matcher;
12  import java.util.regex.Pattern;
13  
14  /**
15   * The extension to PageProcessor for page model extractor.
16   *
17   * @author code4crafter@gmail.com <br>
18   * @since 0.2.0
19   */
20  class ModelPageProcessor implements PageProcessor {
21  
22      private List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>();
23  
24      private Site site;
25  
26      private boolean extractLinks = true;
27  
28      public static ModelPageProcessor create(Site site, Class... clazzs) {
29          ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
30          for (Class clazz : clazzs) {
31              modelPageProcessor.addPageModel(clazz);
32          }
33          return modelPageProcessor;
34      }
35  
36  
37      public ModelPageProcessor addPageModel(Class clazz) {
38          PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
39          pageModelExtractorList.add(pageModelExtractor);
40          return this;
41      }
42  
43      private ModelPageProcessor(Site site) {
44          this.site = site;
45      }
46  
47      @Override
48      public void process(Page page) {
49          for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
50              if (extractLinks) {
51                  extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
52                  extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
53              }
54              Object process = pageModelExtractor.process(page);
55              if (process == null || (process instanceof List && ((List) process).size() == 0)) {
56                  continue;
57              }
58              postProcessPageModel(pageModelExtractor.getClazz(), process);
59              page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
60          }
61          if (page.getResultItems().getAll().size() == 0) {
62              page.getResultItems().setSkip(true);
63          }
64      }
65  
66      private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
67          List<String> links;
68          if (urlRegionSelector == null) {
69              links = page.getHtml().links().all();
70          } else {
71              links = page.getHtml().selectList(urlRegionSelector).links().all();
72          }
73          for (String link : links) {
74              for (Pattern targetUrlPattern : urlPatterns) {
75                  Matcher matcher = targetUrlPattern.matcher(link);
76                  if (matcher.find()) {
77                      page.addTargetRequest(new Request(matcher.group(0)));
78                  }
79              }
80          }
81      }
82  
83      protected void postProcessPageModel(Class clazz, Object object) {
84      }
85  
86      @Override
87      public Site getSite() {
88          return site;
89      }
90  
91      public boolean isExtractLinks() {
92          return extractLinks;
93      }
94  
95      public void setExtractLinks(boolean extractLinks) {
96          this.extractLinks = extractLinks;
97      }
98  }