1 package us.codecraft.webmagic.model;
2
3 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Request;
5 import us.codecraft.webmagic.Site;
6 import us.codecraft.webmagic.processor.PageProcessor;
7 import us.codecraft.webmagic.selector.Selector;
8
9 import java.util.ArrayList;
10 import java.util.List;
11 import java.util.regex.Matcher;
12 import java.util.regex.Pattern;
13
14
15
16
17
18
19
20 class ModelPageProcessor implements PageProcessor {
21
22 private List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>();
23
24 private Site site;
25
26 private boolean extractLinks = true;
27
28 public static ModelPageProcessor create(Site site, Class... clazzs) {
29 ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
30 for (Class clazz : clazzs) {
31 modelPageProcessor.addPageModel(clazz);
32 }
33 return modelPageProcessor;
34 }
35
36
37 public ModelPageProcessor addPageModel(Class clazz) {
38 PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
39 pageModelExtractorList.add(pageModelExtractor);
40 return this;
41 }
42
43 private ModelPageProcessor(Site site) {
44 this.site = site;
45 }
46
47 @Override
48 public void process(Page page) {
49 for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
50 if (extractLinks) {
51 extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns());
52 extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns());
53 }
54 Object process = pageModelExtractor.process(page);
55 if (process == null || (process instanceof List && ((List) process).size() == 0)) {
56 continue;
57 }
58 postProcessPageModel(pageModelExtractor.getClazz(), process);
59 page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
60 }
61 if (page.getResultItems().getAll().size() == 0) {
62 page.getResultItems().setSkip(true);
63 }
64 }
65
66 private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
67 List<String> links;
68 if (urlRegionSelector == null) {
69 links = page.getHtml().links().all();
70 } else {
71 links = page.getHtml().selectList(urlRegionSelector).links().all();
72 }
73 for (String link : links) {
74 for (Pattern targetUrlPattern : urlPatterns) {
75 Matcher matcher = targetUrlPattern.matcher(link);
76 if (matcher.find()) {
77 page.addTargetRequest(new Request(matcher.group(0)));
78 }
79 }
80 }
81 }
82
83 protected void postProcessPageModel(Class clazz, Object object) {
84 }
85
86 @Override
87 public Site getSite() {
88 return site;
89 }
90
91 public boolean isExtractLinks() {
92 return extractLinks;
93 }
94
95 public void setExtractLinks(boolean extractLinks) {
96 this.extractLinks = extractLinks;
97 }
98 }