View Javadoc
1   package us.codecraft.webmagic.samples;
2   
3   import us.codecraft.webmagic.Page;
4   import us.codecraft.webmagic.Site;
5   import us.codecraft.webmagic.Spider;
6   import us.codecraft.webmagic.processor.PageProcessor;
7   
8   /**
9    * @author code4crafter@gmail.com <br>
10   * @since 0.5.1
11   */
12  public class GithubRepoPageProcessor implements PageProcessor {
13  
14      private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
15  
16      @Override
17      public void process(Page page) {
18          page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
19          page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
20          GithubRepo githubRepo = new GithubRepo();
21          githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
22          githubRepo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString());
23          githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
24          if (githubRepo.getName() == null) {
25              //skip this page
26              page.setSkip(true);
27          } else {
28              page.putField("repo", githubRepo);
29          }
30      }
31  
32      @Override
33      public Site getSite() {
34          return site;
35      }
36  
37      public static void main(String[] args) {
38          Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
39      }
40  }