1 package us.codecraft.webmagic.samples;
2
3 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.processor.PageProcessor;
6
7 import java.util.List;
8
9
10
11
12 public class DiandianBlogProcessor implements PageProcessor {
13
14 private Site site;
15
16 @Override
17 public void process(Page page) {
18
19
20
21
22 List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all();
23
24 page.addTargetRequests(requests);
25
26
27 page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString());
28
29 page.putField("content", page.getHtml().smartContent());
30 page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
31 page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
32 }
33
34 @Override
35 public Site getSite() {
36
37 if (site == null) {
38 site = Site.me().setDomain("progressdaily.diandian.com").
39 setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
40 }
41 return site;
42 }
43 }