View Javadoc
1   package us.codecraft.webmagic.samples;
2   
3   import us.codecraft.webmagic.Page;
4   import us.codecraft.webmagic.Site;
5   import us.codecraft.webmagic.Spider;
6   import us.codecraft.webmagic.processor.PageProcessor;
7   import us.codecraft.webmagic.samples.pipeline.OneFilePipeline;
8   import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
9   import us.codecraft.webmagic.selector.Selectable;
10  
11  import java.io.FileNotFoundException;
12  import java.io.UnsupportedEncodingException;
13  import java.util.List;
14  
15  /**
16   * @author code4crafer@gmail.com
17   */
18  public class MamacnPageProcessor implements PageProcessor {
19  
20      private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100);
21  
22      @Override
23      public void process(Page page) {
24          List<Selectable> nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
25          StringBuilder accum = new StringBuilder();
26          for (Selectable node : nodes) {
27              accum.append("img:").append(node.xpath("//a/@href").get()).append("\n");
28              accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n");
29          }
30          page.putField("",accum.toString());
31          if (accum.length() == 0) {
32              page.setSkip(true);
33          }
34          page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
35      }
36  
37      @Override
38      public Site getSite() {
39          return site;
40      }
41  
42      public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
43          Spider.create(new MamacnPageProcessor())
44                  .setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn"))
45                  .addUrl("http://www.mama.cn/photo/t1-p1.html")
46                  .addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data"))
47                  .thread(5)
48                  .run();
49      }
50  }