View Javadoc
1   package us.codecraft.webmagic.model.samples;
2   
3   import us.codecraft.webmagic.MultiPageModel;
4   import us.codecraft.webmagic.Site;
5   import us.codecraft.webmagic.model.OOSpider;
6   import us.codecraft.webmagic.model.annotation.ExtractBy;
7   import us.codecraft.webmagic.model.annotation.ExtractByUrl;
8   import us.codecraft.webmagic.model.annotation.TargetUrl;
9   import us.codecraft.webmagic.pipeline.ConsolePipeline;
10  import us.codecraft.webmagic.pipeline.MultiPagePipeline;
11  import us.codecraft.webmagic.scheduler.RedisScheduler;
12  
13  import java.util.Collection;
14  import java.util.List;
15  
16  /**
17   * @author code4crafter@gmail.com <br>
18   */
19  @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
20  public class News163 implements MultiPageModel {
21  
22      @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
23      private String pageKey;
24  
25      @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
26      private String page;
27  
28      @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/regex('http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html',1)"
29              , multi = true, notNull = false)
30      private List<String> otherPage;
31  
32      @ExtractBy("//h1[@id=\"h1title\"]/text()")
33      private String title;
34  
35      @ExtractBy("//div[@id=\"epContentLeft\"]")
36      private String content;
37  
38      @Override
39      public String getPageKey() {
40          return pageKey;
41      }
42  
43      @Override
44      public Collection<String> getOtherPages() {
45          return otherPage;
46      }
47  
48      @Override
49      public String getPage() {
50          if (page == null) {
51              return "1";
52          }
53          return page;
54      }
55  
56      @Override
57      public MultiPageModel combine(MultiPageModel multiPageModel) {
58          News163 news163 = new News163();
59          news163.title = this.title;
60          News163 pagedModel1 = (News163) multiPageModel;
61          news163.content = this.content + pagedModel1.content;
62          return news163;
63      }
64  
65      @Override
66      public String toString() {
67          return "News163{" +
68                  "content='" + content + '\'' +
69                  ", title='" + title + '\'' +
70                  ", otherPage=" + otherPage +
71                  '}';
72      }
73  
74      public static void main(String[] args) {
75          OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html")
76                  .scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run();
77      }
78  
79  }