1 package us.codecraft.webmagic.example;
2
3 import us.codecraft.webmagic.Page;
4 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.Spider;
6 import us.codecraft.webmagic.model.PageMapper;
7 import us.codecraft.webmagic.processor.PageProcessor;
8
9
10
11
12
13 public class GithubRepoPageMapper implements PageProcessor {
14
15 private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
16
17 private PageMapper<GithubRepo> githubRepoPageMapper = new PageMapper<GithubRepo>(GithubRepo.class);
18
19 @Override
20 public void process(Page page) {
21 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
22 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
23 GithubRepo githubRepo = githubRepoPageMapper.get(page);
24 if (githubRepo == null) {
25 page.setSkip(true);
26 } else {
27 page.putField("repo", githubRepo);
28 }
29
30 }
31
32 @Override
33 public Site getSite() {
34 return site;
35 }
36
37 public static void main(String[] args) {
38 Spider.create(new GithubRepoPageMapper()).addUrl("https://github.com/code4craft").thread(5).run();
39 }
40 }