View Javadoc
1   package us.codecraft.webmagic.example;
2   
3   import org.slf4j.Logger;
4   import org.slf4j.LoggerFactory;
5   
6   import us.codecraft.webmagic.*;
7   import us.codecraft.webmagic.handler.CompositePageProcessor;
8   import us.codecraft.webmagic.handler.CompositePipeline;
9   import us.codecraft.webmagic.handler.PatternProcessor;
10  import us.codecraft.webmagic.handler.RequestMatcher;
11  
12  /**
13   * Created with IntelliJ IDEA.
14   * User: Sebastian MA
15   * Date: April 04, 2014
16   * Time: 21:23
17   */
18  public class PatternProcessorExample {
19  
20      private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class);
21  
22      public static void main(String... args) {
23  
24          // define a patternProcessor which handles only "http://item.jd.com/.*"
25          PatternProcessor githubRepoProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+/[\\w\\-]+") {
26  
27              @Override
28              public RequestMatcher.MatchOther processPage(Page page) {
29                  page.putField("reponame", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
30                  return RequestMatcher.MatchOther.YES;
31              }
32  
33              @Override
34              public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
35                  log.info("Extracting from repo" + resultItems.getRequest());
36                  System.out.println("Repo name: "+resultItems.get("reponame"));
37                  return RequestMatcher.MatchOther.YES;
38              }
39          };
40  
41          PatternProcessor githubUserProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+") {
42  
43              @Override
44              public RequestMatcher.MatchOther processPage(Page page) {
45                  log.info("Extracting from " + page.getUrl());
46                  page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all());
47                  page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all());
48                  page.putField("username", page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString());
49                  return RequestMatcher.MatchOther.YES;
50              }
51  
52              @Override
53              public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
54                  System.out.println("User name: "+resultItems.get("username"));
55                  return RequestMatcher.MatchOther.YES;
56              }
57          };
58  
59          CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com").setRetryTimes(3));
60          CompositePipeline pipeline = new CompositePipeline();
61  
62          pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor);
63          pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor);
64  
65          Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync();
66      }
67  
68  }