View Javadoc
1   package us.codecraft.webmagic.samples;
2   
3   import org.apache.commons.collections4.CollectionUtils;
4   import us.codecraft.webmagic.Page;
5   import us.codecraft.webmagic.Site;
6   import us.codecraft.webmagic.Spider;
7   import us.codecraft.webmagic.processor.PageProcessor;
8   
9   import java.util.List;
10  
11  /**
12   * @author code4crafter@gmail.com <br>
13   */
14  public class InfoQMiniBookProcessor implements PageProcessor {
15  
16      private Site site;
17  
18      @Override
19      public void process(Page page) {
20          page.addTargetRequests(page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all());
21          List<String> all = page.getHtml().links().regex(".*\\.pdf").all();
22          if (CollectionUtils.isNotEmpty(all)) {
23              page.putField("pdf", all);
24          } else {
25              page.getResultItems().setSkip(true);
26          }
27      }
28  
29      @Override
30      public Site getSite() {
31          if (site == null) {
32              site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
33                      setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
34          }
35          return site;
36      }
37  
38      public static void main(String[] args) {
39          Spider.create(new InfoQMiniBookProcessor())
40                  .thread(5)
41                  .addUrl("http://www.infoq.com/cn/minibooks")
42                  .run();
43      }
44  }