1 package us.codecraft.webmagic.example;
2
3 import org.slf4j.Logger;
4 import org.slf4j.LoggerFactory;
5
6 import us.codecraft.webmagic.*;
7 import us.codecraft.webmagic.handler.CompositePageProcessor;
8 import us.codecraft.webmagic.handler.CompositePipeline;
9 import us.codecraft.webmagic.handler.PatternProcessor;
10 import us.codecraft.webmagic.handler.RequestMatcher;
11
12
13
14
15
16
17
18 public class PatternProcessorExample {
19
20 private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class);
21
22 public static void main(String... args) {
23
24
25 PatternProcessor githubRepoProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+/[\\w\\-]+") {
26
27 @Override
28 public RequestMatcher.MatchOther processPage(Page page) {
29 page.putField("reponame", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
30 return RequestMatcher.MatchOther.YES;
31 }
32
33 @Override
34 public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
35 log.info("Extracting from repo" + resultItems.getRequest());
36 System.out.println("Repo name: "+resultItems.get("reponame"));
37 return RequestMatcher.MatchOther.YES;
38 }
39 };
40
41 PatternProcessor githubUserProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+") {
42
43 @Override
44 public RequestMatcher.MatchOther processPage(Page page) {
45 log.info("Extracting from " + page.getUrl());
46 page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all());
47 page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all());
48 page.putField("username", page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString());
49 return RequestMatcher.MatchOther.YES;
50 }
51
52 @Override
53 public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
54 System.out.println("User name: "+resultItems.get("username"));
55 return RequestMatcher.MatchOther.YES;
56 }
57 };
58
59 CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com").setRetryTimes(3));
60 CompositePipeline pipeline = new CompositePipeline();
61
62 pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor);
63 pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor);
64
65 Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync();
66 }
67
68 }