1 package us.codecraft.webmagic.samples;
2
3 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.Spider;
6 import us.codecraft.webmagic.processor.PageProcessor;
7
8
9
10
11
12 public class GithubRepoPageProcessor implements PageProcessor {
13
14 private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
15
16 @Override
17 public void process(Page page) {
18 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
19 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
20 GithubRepo githubRepo = new GithubRepo();
21 githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
22 githubRepo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString());
23 githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
24 if (githubRepo.getName() == null) {
25
26 page.setSkip(true);
27 } else {
28 page.putField("repo", githubRepo);
29 }
30 }
31
32 @Override
33 public Site getSite() {
34 return site;
35 }
36
37 public static void main(String[] args) {
38 Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
39 }
40 }