1 package us.codecraft.webmagic.configurable;
2
3 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.processor.PageProcessor;
6 import us.codecraft.webmagic.utils.Experimental;
7
8 import java.util.List;
9
10
11
12
13 @Experimental
14 public class ConfigurablePageProcessor implements PageProcessor {
15
16 private Site site;
17
18 private List<ExtractRule> extractRules;
19
20 public ConfigurablePageProcessor(Site site, List<ExtractRule> extractRules) {
21 this.site = site;
22 this.extractRules = extractRules;
23 }
24
25 @Override
26 public void process(Page page) {
27 for (ExtractRule extractRule : extractRules) {
28 if (extractRule.isMulti()) {
29 List<String> results = page.getHtml().selectDocumentForList(extractRule.getSelector());
30 if (extractRule.isNotNull() && results.size() == 0) {
31 page.setSkip(true);
32 } else {
33 page.getResultItems().put(extractRule.getFieldName(), results);
34 }
35 } else {
36 String result = page.getHtml().selectDocument(extractRule.getSelector());
37 if (extractRule.isNotNull() && result == null) {
38 page.setSkip(true);
39 } else {
40 page.getResultItems().put(extractRule.getFieldName(), result);
41 }
42 }
43 }
44 }
45
46 @Override
47 public Site getSite() {
48 return site;
49 }
50
51 }