1 package us.codecraft.webmagic.samples;
2
3 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.processor.PageProcessor;
6
7 import java.util.List;
8
9
10
11
12 public class QzoneBlogProcessor implements PageProcessor {
13 @Override
14 public void process(Page page) {
15
16
17
18
19 List<String> requests = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all();
20 page.addTargetRequests(requests);
21 page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
22 page.putField("content",page.getHtml().smartContent());
23 }
24
25 @Override
26 public Site getSite() {
27 return Site.me().setDomain("www.diandian.com").
28 setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
29 }
30 }