1 package us.codecraft.webmagic.samples;
2
3 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.Spider;
6 import us.codecraft.webmagic.pipeline.FilePipeline;
7 import us.codecraft.webmagic.processor.PageProcessor;
8 import us.codecraft.webmagic.selector.Html;
9
10 import java.util.List;
11
12
13
14
15
16 public class ZhihuPageProcessor implements PageProcessor {
17
18 private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000)
19 .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
20 .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
21 .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
22 .setCharset("UTF-8");
23
24 private static final int voteNum = 1000;
25
26
27 @Override
28 public void process(Page page) {
29 List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
30 page.addTargetRequests(relativeUrl);
31 relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
32 page.addTargetRequests(relativeUrl);
33 List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
34 boolean exist = false;
35 for(String answer:answers){
36 String vote = new Html(answer).xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
37 if(Integer.valueOf(vote) >= voteNum){
38 page.putField("vote",vote);
39 page.putField("content",new Html(answer).xpath("//div[@class='zm-editable-content']"));
40 page.putField("userid", new Html(answer).xpath("//a[@class='author-link']/@href"));
41 exist = true;
42 }
43 }
44 if(!exist){
45 page.setSkip(true);
46 }
47 }
48
49 @Override
50 public Site getSite() {
51 return site;
52 }
53
54 public static void main(String[] args) {
55 Spider.create(new ZhihuPageProcessor()).
56 addUrl("http://www.zhihu.com/search?type=question&q=java").
57 addPipeline(new FilePipeline("D:\\webmagic\\")).
58 thread(5).
59 run();
60 }
61 }