1 package us.codecraft.webmagic.samples.scheduler;
2
3 import org.apache.commons.lang3.StringUtils;
4 import us.codecraft.webmagic.Page;
5 import us.codecraft.webmagic.Request;
6 import us.codecraft.webmagic.Site;
7 import us.codecraft.webmagic.Spider;
8 import us.codecraft.webmagic.processor.PageProcessor;
9 import us.codecraft.webmagic.scheduler.PriorityScheduler;
10
11 import java.util.List;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14
15 import static us.codecraft.webmagic.selector.Selectors.xpath;
16
17
18
19
20 public class ZipCodePageProcessor implements PageProcessor {
21
22 private Site site = Site.me().setCharset("gb2312")
23 .setSleepTime(100);
24
25 @Override
26 public void process(Page page) {
27 if (page.getUrl().toString().equals("http://www.ip138.com/post/")) {
28 processCountry(page);
29 } else if (page.getUrl().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").toString() != null) {
30 processDistrict(page);
31 } else {
32 processProvince(page);
33 }
34
35 }
36
37 private void processCountry(Page page) {
38 List<String> provinces = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all();
39 for (String province : provinces) {
40 String link = xpath("//@href").select(province);
41 String title = xpath("/text()").select(province);
42 Request request = new Request(link).setPriority(0).putExtra("province", title);
43 page.addTargetRequest(request);
44 }
45 }
46
47 private void processProvince(Page page) {
48
49 List<String> districts = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();
50 Pattern pattern = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"",Pattern.DOTALL);
51 for (String district : districts) {
52 Matcher matcher = pattern.matcher(district);
53 while (matcher.find()) {
54 String title = matcher.group(1);
55 String link = matcher.group(2);
56 Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title);
57 page.addTargetRequest(request);
58 }
59 }
60 }
61
62 private void processDistrict(Page page) {
63 String province = page.getRequest().getExtra("province").toString();
64 String district = page.getRequest().getExtra("district").toString();
65 String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();
66 page.putField("result", StringUtils.join(new String[]{province, district,
67 zipCode}, "\t"));
68 List<String> links = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
69 for (String link : links) {
70 page.addTargetRequest(new Request(link).setPriority(2).putExtra("province", province).putExtra("district", district));
71 }
72
73 }
74
75 @Override
76 public Site getSite() {
77 return site;
78 }
79
80 public static void main(String[] args) {
81 Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).addUrl("http://www.ip138.com/post/");
82
83 spider.run();
84 }
85 }