1 package us.codecraft.webmagic.model.samples;
2
3 import us.codecraft.webmagic.MultiPageModel;
4 import us.codecraft.webmagic.Site;
5 import us.codecraft.webmagic.model.OOSpider;
6 import us.codecraft.webmagic.model.annotation.ExtractBy;
7 import us.codecraft.webmagic.model.annotation.ExtractByUrl;
8 import us.codecraft.webmagic.model.annotation.TargetUrl;
9 import us.codecraft.webmagic.pipeline.ConsolePipeline;
10 import us.codecraft.webmagic.pipeline.MultiPagePipeline;
11 import us.codecraft.webmagic.scheduler.RedisScheduler;
12
13 import java.util.Collection;
14 import java.util.List;
15
16
17
18
19 @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
20 public class News163 implements MultiPageModel {
21
22 @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
23 private String pageKey;
24
25 @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
26 private String page;
27
28 @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/regex('http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html',1)"
29 , multi = true, notNull = false)
30 private List<String> otherPage;
31
32 @ExtractBy("//h1[@id=\"h1title\"]/text()")
33 private String title;
34
35 @ExtractBy("//div[@id=\"epContentLeft\"]")
36 private String content;
37
38 @Override
39 public String getPageKey() {
40 return pageKey;
41 }
42
43 @Override
44 public Collection<String> getOtherPages() {
45 return otherPage;
46 }
47
48 @Override
49 public String getPage() {
50 if (page == null) {
51 return "1";
52 }
53 return page;
54 }
55
56 @Override
57 public MultiPageModel combine(MultiPageModel multiPageModel) {
58 News163 news163 = new News163();
59 news163.title = this.title;
60 News163 pagedModel1 = (News163) multiPageModel;
61 news163.content = this.content + pagedModel1.content;
62 return news163;
63 }
64
65 @Override
66 public String toString() {
67 return "News163{" +
68 "content='" + content + '\'' +
69 ", title='" + title + '\'' +
70 ", otherPage=" + otherPage +
71 '}';
72 }
73
74 public static void main(String[] args) {
75 OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html")
76 .scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run();
77 }
78
79 }