1 package us.codecraft.webmagic.model.samples;
2
3 import us.codecraft.webmagic.Site;
4 import us.codecraft.webmagic.model.ConsolePageModelPipeline;
5 import us.codecraft.webmagic.model.OOSpider;
6 import us.codecraft.webmagic.model.annotation.ExtractBy;
7 import us.codecraft.webmagic.model.annotation.HelpUrl;
8 import us.codecraft.webmagic.model.annotation.TargetUrl;
9 import us.codecraft.webmagic.scheduler.RedisScheduler;
10
11
12
13
14 @TargetUrl("http://www.jokeji.cn/jokehtml/jy/\\d+.htm")
15 @HelpUrl("http://www.jokeji.cn/list\\w+.htm")
16 public class JokejiModel {
17
18 @ExtractBy("//title/regex('<title>([^_]+)',1)")
19 private String title;
20
21 @ExtractBy("//div[@class=mob_txt]/tidyText()")
22 private String content;
23
24 public static void main(String[] args) {
25 OOSpider.create(Site.me().setDomain("www.jokeji.cn").setCharset("gbk").setSleepTime(100).setTimeOut(3000)
26 .setUserAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)")
27 , new ConsolePageModelPipeline(), JokejiModel.class).addUrl("http://www.jokeji.cn/").thread(2)
28 .scheduler(new RedisScheduler("127.0.0.1"))
29 .run();
30 }
31
32 }