View Javadoc
1   package us.codecraft.webmagic.model.samples;
2   
3   import us.codecraft.webmagic.Site;
4   import us.codecraft.webmagic.model.ConsolePageModelPipeline;
5   import us.codecraft.webmagic.model.OOSpider;
6   import us.codecraft.webmagic.model.annotation.ExtractBy;
7   import us.codecraft.webmagic.model.annotation.HelpUrl;
8   import us.codecraft.webmagic.model.annotation.TargetUrl;
9   import us.codecraft.webmagic.scheduler.RedisScheduler;
10  
11  /**
12   * @author code4crafter@gmail.com
13   */
14  @TargetUrl("http://www.jokeji.cn/jokehtml/jy/\\d+.htm")
15  @HelpUrl("http://www.jokeji.cn/list\\w+.htm")
16  public class JokejiModel {
17  
18      @ExtractBy("//title/regex('<title>([^_]+)',1)")
19      private String title;
20  
21      @ExtractBy("//div[@class=mob_txt]/tidyText()")
22      private String content;
23  
24      public static void main(String[] args) {
25          OOSpider.create(Site.me().setDomain("www.jokeji.cn").setCharset("gbk").setSleepTime(100).setTimeOut(3000)
26                  .setUserAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)")
27                  , new ConsolePageModelPipeline(), JokejiModel.class).addUrl("http://www.jokeji.cn/").thread(2)
28                  .scheduler(new RedisScheduler("127.0.0.1"))
29                  .run();
30      }
31  
32  }