1 package us.codecraft.webmagic.downloader;
2
3 import org.slf4j.Logger;
4 import org.slf4j.LoggerFactory;
5 import us.codecraft.webmagic.Page;
6 import us.codecraft.webmagic.Request;
7 import us.codecraft.webmagic.Task;
8 import us.codecraft.webmagic.selector.PlainText;
9
10 import java.io.*;
11
12
13
14
15
16
17
18 public class PhantomJSDownloader extends AbstractDownloader {
19 private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
20 private static String crawlJsPath;
21 private static String phantomJsCommand = "phantomjs";
22
23 public PhantomJSDownloader() {
24 this.initPhantomjsCrawlPath();
25 }
26
27
28
29
30
31
32
33
34
35
36
37 public PhantomJSDownloader(String phantomJsCommand) {
38 this.initPhantomjsCrawlPath();
39 PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
40 }
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75 public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
76 PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
77 PhantomJSDownloader.crawlJsPath = crawlJsPath;
78 }
79
80 private void initPhantomjsCrawlPath() {
81 PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
82 + System.getProperty("file.separator") + "crawl.js ";
83 }
84
85 @Override
86 public Page download(Request request, Task task) {
87 if (logger.isInfoEnabled()) {
88 logger.info("downloading page: " + request.getUrl());
89 }
90
91 Page page = Page.fail(request);
92 try {
93 String content = getPage(request);
94 if (!content.contains("HTTP request failed")) {
95 page.setDownloadSuccess(true);
96 page.setRawText(content);
97 page.setUrl(new PlainText(request.getUrl()));
98 page.setRequest(request);
99 page.setStatusCode(200);
100 }
101 onSuccess(page, task);
102 } catch (Exception e) {
103 onError(page, task, e);
104 logger.warn("download page {} error", request.getUrl(), e);
105 }
106 return page;
107 }
108
109 @Override
110 public void setThread(int threadNum) {
111
112 }
113
114 protected String getPage(Request request) throws Exception {
115 String url = request.getUrl();
116 Runtime runtime = Runtime.getRuntime();
117 Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
118 InputStream is = process.getInputStream();
119 BufferedReader br = new BufferedReader(new InputStreamReader(is));
120 StringBuilder builder = new StringBuilder();
121 String line;
122 while ((line = br.readLine()) != null) {
123 builder.append(line).append("\n");
124 }
125 return builder.toString();
126 }
127 }