View Javadoc
1   package us.codecraft.webmagic.downloader;
2   
3   import java.io.IOException;
4   import java.nio.charset.Charset;
5   import java.util.HashMap;
6   import java.util.Map;
7   import java.util.Optional;
8   
9   import org.apache.commons.io.IOUtils;
10  import org.apache.http.HttpEntity;
11  import org.apache.http.HttpResponse;
12  import org.apache.http.client.methods.CloseableHttpResponse;
13  import org.apache.http.impl.client.CloseableHttpClient;
14  import org.apache.http.util.EntityUtils;
15  
16  import us.codecraft.webmagic.Page;
17  import us.codecraft.webmagic.Request;
18  import us.codecraft.webmagic.Site;
19  import us.codecraft.webmagic.Task;
20  import us.codecraft.webmagic.proxy.Proxy;
21  import us.codecraft.webmagic.proxy.ProxyProvider;
22  import us.codecraft.webmagic.selector.PlainText;
23  import us.codecraft.webmagic.utils.CharsetUtils;
24  import us.codecraft.webmagic.utils.HttpClientUtils;
25  
26  /**
27   * The http downloader based on HttpClient.
28   *
29   * @author code4crafter@gmail.com <br>
30   * @since 0.1.0
31   */
32  public class HttpClientDownloader extends AbstractDownloader {
33  
34      private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
35  
36      private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
37  
38      private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
39  
40      private ProxyProvider proxyProvider;
41  
42      private boolean responseHeader = true;
43  
44      public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
45          this.httpUriRequestConverter = httpUriRequestConverter;
46      }
47  
48      public void setProxyProvider(ProxyProvider proxyProvider) {
49          this.proxyProvider = proxyProvider;
50      }
51  
52      private CloseableHttpClient getHttpClient(Site site) {
53          if (site == null) {
54              return httpClientGenerator.getClient(null);
55          }
56          String domain = site.getDomain();
57          CloseableHttpClient httpClient = httpClients.get(domain);
58          if (httpClient == null) {
59              synchronized (this) {
60                  httpClient = httpClients.get(domain);
61                  if (httpClient == null) {
62                      httpClient = httpClientGenerator.getClient(site);
63                      httpClients.put(domain, httpClient);
64                  }
65              }
66          }
67          return httpClient;
68      }
69  
70      @Override
71      public Page download(Request request, Task task) {
72          if (task == null || task.getSite() == null) {
73              throw new NullPointerException("task or site can not be null");
74          }
75          CloseableHttpResponse httpResponse = null;
76          CloseableHttpClient httpClient = getHttpClient(task.getSite());
77          Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
78          HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
79          Page page = Page.fail(request);
80          try {
81              httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
82              page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
83              onSuccess(page, task);
84              return page;
85          } catch (IOException e) {
86              onError(page, task, e);
87              return page;
88          } finally {
89              if (httpResponse != null) {
90                  //ensure the connection is released back to pool
91                  EntityUtils.consumeQuietly(httpResponse.getEntity());
92              }
93              if (proxyProvider != null && proxy != null) {
94                  proxyProvider.returnProxy(proxy, page, task);
95              }
96          }
97      }
98  
99      @Override
100     public void setThread(int thread) {
101         httpClientGenerator.setPoolSize(thread);
102     }
103 
104     protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
105         HttpEntity entity = httpResponse.getEntity();
106         byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];
107         String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null;
108         Page page = new Page();
109         page.setBytes(bytes);
110         if (!request.isBinaryContent()) {
111             if (charset == null) {
112                 charset = getHtmlCharset(contentType, bytes, task);
113             }
114             page.setCharset(charset);
115             page.setRawText(new String(bytes, charset));
116         }
117         page.setUrl(new PlainText(request.getUrl()));
118         page.setRequest(request);
119         page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
120         page.setDownloadSuccess(true);
121         if (responseHeader) {
122             page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
123         }
124         return page;
125     }
126 
127     private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
128         String charset = CharsetUtils.detectCharset(contentType, contentBytes);
129         if (charset == null) {
130             charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
131         }
132         return charset;
133     }
134 }