View Javadoc
1   package us.codecraft.webmagic.samples;
2   
3   
4   import java.util.List;
5   import org.apache.commons.collections4.CollectionUtils;
6   import us.codecraft.webmagic.Page;
7   import us.codecraft.webmagic.Site;
8   import us.codecraft.webmagic.Spider;
9   import us.codecraft.webmagic.processor.PageProcessor;
10  import us.codecraft.webmagic.selector.JsonPathSelector;
11  
12  /**
13   * @author code4crafter@gmail.com
14   * @since 0.5.0
15   */
16  public class AngularJSProcessor implements PageProcessor {
17  
18      private Site site = Site.me();
19  
20      private static final String ARITICALE_URL = "http://angularjs\\.cn/api/article/\\w+";
21  
22      private static final String LIST_URL = "http://angularjs\\.cn/api/article/latest.*";
23  
24      @Override
25      public void process(Page page) {
26          if (page.getUrl().regex(LIST_URL).match()) {
27              List<String> ids = new JsonPathSelector("$.data[*]._id").selectList(page.getRawText());
28              if (CollectionUtils.isNotEmpty(ids)) {
29                  for (String id : ids) {
30                      page.addTargetRequest("http://angularjs.cn/api/article/" + id);
31                  }
32              }
33          } else {
34              page.putField("title", new JsonPathSelector("$.data.title").select(page.getRawText()));
35              page.putField("content", new JsonPathSelector("$.data.content").select(page.getRawText()));
36          }
37  
38      }
39  
40      @Override
41      public Site getSite() {
42          return site;
43      }
44  
45      public static void main(String[] args) {
46          Spider.create(new AngularJSProcessor()).addUrl("http://angularjs.cn/api/article/latest?p=1&s=20").run();
47      }
48  }