View Javadoc
1   package us.codecraft.webmagic.model.samples;
2   
3   import us.codecraft.webmagic.Site;
4   import us.codecraft.webmagic.Task;
5   import us.codecraft.webmagic.model.OOSpider;
6   import us.codecraft.webmagic.pipeline.PageModelPipeline;
7   import us.codecraft.webmagic.model.annotation.ExtractBy;
8   import us.codecraft.webmagic.model.annotation.TargetUrl;
9   
10  import java.util.List;
11  
12  /**
13   * @author code4crafter@gmail.com <br>
14   */
15  @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
16  public class OschinaBlog{
17  
18      @ExtractBy("//title")
19      private String title;
20  
21      @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
22      private String content;
23  
24      @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
25      private List<String> tags;
26  
27      public static void main(String[] args) {
28          OOSpider.create(Site.me()
29                  .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
30                  .setSleepTime(0)
31                  .setRetryTimes(3)
32                  ,new PageModelPipeline() {
33              @Override
34              public void process(Object o, Task task) {
35  
36              }
37          }, OschinaBlog.class).thread(10).addUrl("http://my.oschina.net/flashsword/blog").run();
38      }
39  
40      public String getTitle() {
41          return title;
42      }
43  
44      public String getContent() {
45          return content;
46      }
47  
48      public List<String> getTags() {
49          return tags;
50      }
51  }