1 package us.codecraft.webmagic.samples;
2
3 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.Spider;
6 import us.codecraft.webmagic.processor.PageProcessor;
7 import us.codecraft.webmagic.selector.PlainText;
8
9 import java.util.List;
10
11
12
13
14
15
16 public class DiaoyuwengProcessor implements PageProcessor {
17
18 private Site site;
19
20 @Override
21 public void process(Page page) {
22 List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
23 page.addTargetRequests(requests);
24 requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all();
25 page.addTargetRequests(requests);
26 if (page.getUrl().toString().contains("thread")){
27 page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
28 page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
29 page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
30 page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
31 }
32 }
33
34 @Override
35 public Site getSite() {
36 if (site==null){
37 site= Site.me().setDomain("www.diaoyuweng.com").
38 setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
39 }
40 return site;
41 }
42
43 public static void main(String[] args) {
44 Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run();
45 }
46 }