View Javadoc
1   package us.codecraft.webmagic.downloader.selenium;
2   
3   import org.openqa.selenium.By;
4   import org.openqa.selenium.Cookie;
5   import org.openqa.selenium.WebDriver;
6   import org.openqa.selenium.WebElement;
7   import org.slf4j.Logger;
8   import org.slf4j.LoggerFactory;
9   
10  import us.codecraft.webmagic.Page;
11  import us.codecraft.webmagic.Request;
12  import us.codecraft.webmagic.Site;
13  import us.codecraft.webmagic.Task;
14  import us.codecraft.webmagic.downloader.AbstractDownloader;
15  import us.codecraft.webmagic.selector.Html;
16  import us.codecraft.webmagic.selector.PlainText;
17  
18  import java.io.Closeable;
19  import java.io.IOException;
20  import java.util.Map;
21  
22  /**
23   * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。<br>
24   * 需要下载Selenium driver支持。<br>
25   *
26   * @author code4crafter@gmail.com <br>
27   * Date: 13-7-26 <br>
28   * Time: 下午1:37 <br>
29   */
30  public class SeleniumDownloader extends AbstractDownloader implements Closeable {
31  
32      private volatile WebDriverPool webDriverPool;
33  
34      private Logger logger = LoggerFactory.getLogger(getClass());
35  
36      private int sleepTime = 0;
37  
38      private int poolSize = 1;
39  
40      private static final String DRIVER_PHANTOMJS = "phantomjs";
41  
42      /**
43       * 新建
44       *
45       * @param chromeDriverPath chromeDriverPath
46       */
47      public SeleniumDownloader(String chromeDriverPath) {
48          System.getProperties().setProperty("webdriver.chrome.driver",
49                  chromeDriverPath);
50      }
51  
52      /**
53       * Constructor without any filed. Construct PhantomJS browser
54       *
55       * @author bob.li.0718@gmail.com
56       */
57      public SeleniumDownloader() {
58          // System.setProperty("phantomjs.binary.path",
59          // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
60      }
61  
62      /**
63       * set sleep time to wait until load success
64       *
65       * @param sleepTime sleepTime
66       * @return this
67       */
68      public SeleniumDownloader setSleepTime(int sleepTime) {
69          this.sleepTime = sleepTime;
70          return this;
71      }
72  
73      @Override
74      public Page download(Request request, Task task) {
75          checkInit();
76          WebDriver webDriver = null;
77          Page page = Page.fail(request);
78          try {
79              webDriver = webDriverPool.get();
80  
81              logger.info("downloading page " + request.getUrl());
82              webDriver.get(request.getUrl());
83              try {
84                  if (sleepTime > 0) {
85                      Thread.sleep(sleepTime);
86                  }
87              } catch (InterruptedException e) {
88                  e.printStackTrace();
89              }
90              WebDriver.Options manage = webDriver.manage();
91              Site site = task.getSite();
92              if (site.getCookies() != null) {
93                  for (Map.Entry<String, String> cookieEntry : site.getCookies()
94                          .entrySet()) {
95                      Cookie cookie = new Cookie(cookieEntry.getKey(),
96                              cookieEntry.getValue());
97                      manage.addCookie(cookie);
98                  }
99              }
100 
101             /*
102              * TODO You can add mouse event or other processes
103              *
104              * @author: bob.li.0718@gmail.com
105              */
106 
107             WebElement webElement = webDriver.findElement(By.xpath("/html"));
108             String content = webElement.getAttribute("outerHTML");
109             page.setDownloadSuccess(true);
110             page.setRawText(content);
111             page.setHtml(new Html(content, request.getUrl()));
112             page.setUrl(new PlainText(request.getUrl()));
113             page.setRequest(request);
114             onSuccess(page, task);
115         } catch (Exception e) {
116             logger.warn("download page {} error", request.getUrl(), e);
117             onError(page, task, e);
118         } finally {
119             if (webDriver != null) {
120                 webDriverPool.returnToPool(webDriver);
121             }
122         }
123         return page;
124     }
125 
126     private void checkInit() {
127         if (webDriverPool == null) {
128             synchronized (this) {
129                 webDriverPool = new WebDriverPool(poolSize);
130             }
131         }
132     }
133 
134     @Override
135     public void setThread(int thread) {
136         this.poolSize = thread;
137     }
138 
139     @Override
140     public void close() throws IOException {
141         webDriverPool.closeAll();
142     }
143 }