1 package us.codecraft.webmagic.samples;
2
3
4 import java.util.List;
5 import org.apache.commons.collections4.CollectionUtils;
6 import us.codecraft.webmagic.Page;
7 import us.codecraft.webmagic.Site;
8 import us.codecraft.webmagic.Spider;
9 import us.codecraft.webmagic.processor.PageProcessor;
10 import us.codecraft.webmagic.selector.JsonPathSelector;
11
12
13
14
15
16 public class AngularJSProcessor implements PageProcessor {
17
18 private Site site = Site.me();
19
20 private static final String ARITICALE_URL = "http://angularjs\\.cn/api/article/\\w+";
21
22 private static final String LIST_URL = "http://angularjs\\.cn/api/article/latest.*";
23
24 @Override
25 public void process(Page page) {
26 if (page.getUrl().regex(LIST_URL).match()) {
27 List<String> ids = new JsonPathSelector("$.data[*]._id").selectList(page.getRawText());
28 if (CollectionUtils.isNotEmpty(ids)) {
29 for (String id : ids) {
30 page.addTargetRequest("http://angularjs.cn/api/article/" + id);
31 }
32 }
33 } else {
34 page.putField("title", new JsonPathSelector("$.data.title").select(page.getRawText()));
35 page.putField("content", new JsonPathSelector("$.data.content").select(page.getRawText()));
36 }
37
38 }
39
40 @Override
41 public Site getSite() {
42 return site;
43 }
44
45 public static void main(String[] args) {
46 Spider.create(new AngularJSProcessor()).addUrl("http://angularjs.cn/api/article/latest?p=1&s=20").run();
47 }
48 }