package us.codecraft.webmagic.downloader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;

import java.io.*;

/**
 * This downloader uses PhantomJS to render pages that need JavaScript execution before they are parsed.
 *
 * @author dolphineor@gmail.com
 * @version 0.5.3
 */
public class PhantomJSDownloader extends AbstractDownloader {
    private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
    private static String crawlJsPath;
    private static String phantomJsCommand = "phantomjs"; // default

    public PhantomJSDownloader() {
        this.initPhantomjsCrawlPath();
    }

    /**
     * Additional constructor that supports a custom phantomjs command.
     * <p>
     * example:
     * phantomjs.exe (for Windows environments)
     * phantomjs --ignore-ssl-errors=yes (ignores certain errors when the crawled URL uses https)
     * /usr/local/bin/phantomjs (absolute path to the command, avoiding an IOException caused by the system environment variables)
     *
     * @param phantomJsCommand phantomJsCommand
     */
    public PhantomJSDownloader(String phantomJsCommand) {
        this.initPhantomjsCrawlPath();
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
    }

    /**
     * Additional constructor that supports a custom crawl.js path. When another project depends on this jar,
     * runtime.exec() cannot use the crawl.js packaged inside the jar when it runs the phantomjs command.
     * <pre>
     * crawl.js start --
     *
     *   var system = require('system');
     *   var url = system.args[1];
     *
     *   var page = require('webpage').create();
     *   page.settings.loadImages = false;
     *   page.settings.resourceTimeout = 5000;
     *
     *   page.open(url, function (status) {
     *       if (status != 'success') {
     *           console.log("HTTP request failed!");
     *       } else {
     *           console.log(page.content);
     *       }
     *
     *       page.close();
     *       phantom.exit();
     *   });
     *
     * -- crawl.js end
     * </pre>
     * Copy the JavaScript above into your own project and point this constructor at it.
     * <p>
     * example:
     * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
     *
     * @param phantomJsCommand phantomJsCommand
     * @param crawlJsPath      crawlJsPath
     */
    public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
        PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
        PhantomJSDownloader.crawlJsPath = crawlJsPath;
    }

    private void initPhantomjsCrawlPath() {
        // Default: look for crawl.js next to the classpath root. This does not work when crawl.js is
        // packaged inside a jar; use the (phantomJsCommand, crawlJsPath) constructor in that case.
        PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
                + System.getProperty("file.separator") + "crawl.js";
    }

    @Override
    public Page download(Request request, Task task) {
        if (logger.isInfoEnabled()) {
            logger.info("downloading page: {}", request.getUrl());
        }

        Page page = Page.fail(request);
        try {
            String content = getPage(request);
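            // crawl.js prints "HTTP request failed!" when page.open does not succeed, so the presence
            // of that marker in the phantomjs output is treated as a failed download.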
            if (!content.contains("HTTP request failed")) {
                page.setDownloadSuccess(true);
                page.setRawText(content);
                page.setUrl(new PlainText(request.getUrl()));
                page.setRequest(request);
                page.setStatusCode(200);
            }
            onSuccess(page, task);
        } catch (Exception e) {
            onError(page, task, e);
            logger.warn("download page {} error", request.getUrl(), e);
        }
        return page;
    }

    @Override
    public void setThread(int threadNum) {
        // ignore
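        // (nothing to configure here: each download launches its own phantomjs process in getPage,
        // so the requested thread count has no shared client or pool to apply to)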
    }

    protected String getPage(Request request) throws Exception {
        String url = request.getUrl();
        Runtime runtime = Runtime.getRuntime();
        // Run crawl.js through phantomjs and read the rendered HTML from the process's standard output.
        Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
        StringBuilder builder = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                builder.append(line).append("\n");
            }
        }
        return builder.toString();
    }
}
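
/*
 * Usage sketch: one way to wire this downloader into a WebMagic Spider. "MyPageProcessor" is a
 * hypothetical PageProcessor implementation, and the phantomjs/crawl.js paths below are placeholders
 * for your own installation.
 *
 *   Spider.create(new MyPageProcessor())
 *           .setDownloader(new PhantomJSDownloader("/usr/local/bin/phantomjs", "/your/path/crawl.js"))
 *           .addUrl("https://example.com/")
 *           .run();
 */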