View Javadoc
1   package us.codecraft.webmagic.downloader;
2   
3   import java.io.IOException;
4   import java.nio.charset.Charset;
5   import java.util.HashMap;
6   import java.util.Map;
7   import java.util.Optional;
8   
9   import org.apache.commons.io.IOUtils;
10  import org.apache.http.HttpResponse;
11  import org.apache.http.client.methods.CloseableHttpResponse;
12  import org.apache.http.impl.client.CloseableHttpClient;
13  import org.apache.http.util.EntityUtils;
14  import org.slf4j.Logger;
15  import org.slf4j.LoggerFactory;
16  
17  import us.codecraft.webmagic.Page;
18  import us.codecraft.webmagic.Request;
19  import us.codecraft.webmagic.Site;
20  import us.codecraft.webmagic.Task;
21  import us.codecraft.webmagic.proxy.Proxy;
22  import us.codecraft.webmagic.proxy.ProxyProvider;
23  import us.codecraft.webmagic.selector.PlainText;
24  import us.codecraft.webmagic.utils.CharsetUtils;
25  import us.codecraft.webmagic.utils.HttpClientUtils;
26  
27  /**
28   * The http downloader based on HttpClient.
29   *
30   * @author code4crafter@gmail.com <br>
31   * @since 0.1.0
32   */
33  public class HttpClientDownloader extends AbstractDownloader {
34  
35      private Logger logger = LoggerFactory.getLogger(getClass());
36  
37      private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
38  
39      private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
40  
41      private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
42  
43      private ProxyProvider proxyProvider;
44  
45      private boolean responseHeader = true;
46  
47      public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
48          this.httpUriRequestConverter = httpUriRequestConverter;
49      }
50  
51      public void setProxyProvider(ProxyProvider proxyProvider) {
52          this.proxyProvider = proxyProvider;
53      }
54  
55      private CloseableHttpClient getHttpClient(Site site) {
56          if (site == null) {
57              return httpClientGenerator.getClient(null);
58          }
59          String domain = site.getDomain();
60          CloseableHttpClient httpClient = httpClients.get(domain);
61          if (httpClient == null) {
62              synchronized (this) {
63                  httpClient = httpClients.get(domain);
64                  if (httpClient == null) {
65                      httpClient = httpClientGenerator.getClient(site);
66                      httpClients.put(domain, httpClient);
67                  }
68              }
69          }
70          return httpClient;
71      }
72  
73      @Override
74      public Page download(Request request, Task task) {
75          if (task == null || task.getSite() == null) {
76              throw new NullPointerException("task or site can not be null");
77          }
78          CloseableHttpResponse httpResponse = null;
79          CloseableHttpClient httpClient = getHttpClient(task.getSite());
80          Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
81          HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
82          Page page = Page.fail(request);
83          try {
84              httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
85              page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
86  
87              onSuccess(page, task);
88              logger.info("downloading page success {}", request.getUrl());
89  
90              return page;
91          } catch (IOException e) {
92  
93              onError(page, task, e);
94              logger.info("download page {} error", request.getUrl(), e);
95  
96              return page;
97          } finally {
98              if (httpResponse != null) {
99                  //ensure the connection is released back to pool
100                 EntityUtils.consumeQuietly(httpResponse.getEntity());
101             }
102             if (proxyProvider != null && proxy != null) {
103                 proxyProvider.returnProxy(proxy, page, task);
104             }
105         }
106     }
107 
108     @Override
109     public void setThread(int thread) {
110         httpClientGenerator.setPoolSize(thread);
111     }
112 
113     protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
114         byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
115         String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
116         Page page = new Page();
117         page.setBytes(bytes);
118         if (!request.isBinaryContent()) {
119             if (charset == null) {
120                 charset = getHtmlCharset(contentType, bytes, task);
121             }
122             page.setCharset(charset);
123             page.setRawText(new String(bytes, charset));
124         }
125         page.setUrl(new PlainText(request.getUrl()));
126         page.setRequest(request);
127         page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
128         page.setDownloadSuccess(true);
129         if (responseHeader) {
130             page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
131         }
132         return page;
133     }
134 
135     private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
136         String charset = CharsetUtils.detectCharset(contentType, contentBytes);
137         if (charset == null) {
138             charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
139             logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
140         }
141         return charset;
142     }
143 }