View Javadoc
1   package us.codecraft.webmagic.configurable;
2   
3   import us.codecraft.webmagic.Page;
4   import us.codecraft.webmagic.Site;
5   import us.codecraft.webmagic.processor.PageProcessor;
6   import us.codecraft.webmagic.utils.Experimental;
7   
8   import java.util.List;
9   
10  /**
11   * @author code4crafter@gmail.com <br>
12   */
13  @Experimental
14  public class ConfigurablePageProcessor implements PageProcessor {
15  
16      private Site site;
17  
18      private List<ExtractRule> extractRules;
19  
20      public ConfigurablePageProcessor(Site site, List<ExtractRule> extractRules) {
21          this.site = site;
22          this.extractRules = extractRules;
23      }
24  
25      @Override
26      public void process(Page page) {
27          for (ExtractRule extractRule : extractRules) {
28              if (extractRule.isMulti()) {
29                  List<String> results = page.getHtml().selectDocumentForList(extractRule.getSelector());
30                  if (extractRule.isNotNull() && results.size() == 0) {
31                      page.setSkip(true);
32                  } else {
33                      page.getResultItems().put(extractRule.getFieldName(), results);
34                  }
35              } else {
36                  String result = page.getHtml().selectDocument(extractRule.getSelector());
37                  if (extractRule.isNotNull() && result == null) {
38                      page.setSkip(true);
39                  } else {
40                      page.getResultItems().put(extractRule.getFieldName(), result);
41                  }
42              }
43          }
44      }
45  
46      @Override
47      public Site getSite() {
48          return site;
49      }
50  
51  }