1 package us.codecraft.webmagic.downloader;
2
3 import java.io.IOException;
4 import java.nio.charset.Charset;
5 import java.util.HashMap;
6 import java.util.Map;
7 import java.util.Optional;
8
9 import org.apache.commons.io.IOUtils;
10 import org.apache.http.HttpEntity;
11 import org.apache.http.HttpResponse;
12 import org.apache.http.client.methods.CloseableHttpResponse;
13 import org.apache.http.impl.client.CloseableHttpClient;
14 import org.apache.http.util.EntityUtils;
15
16 import us.codecraft.webmagic.Page;
17 import us.codecraft.webmagic.Request;
18 import us.codecraft.webmagic.Site;
19 import us.codecraft.webmagic.Task;
20 import us.codecraft.webmagic.proxy.Proxy;
21 import us.codecraft.webmagic.proxy.ProxyProvider;
22 import us.codecraft.webmagic.selector.PlainText;
23 import us.codecraft.webmagic.utils.CharsetUtils;
24 import us.codecraft.webmagic.utils.HttpClientUtils;
25
26
27
28
29
30
31
32 public class HttpClientDownloader extends AbstractDownloader {
33
34 private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
35
36 private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
37
38 private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
39
40 private ProxyProvider proxyProvider;
41
42 private boolean responseHeader = true;
43
44 public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
45 this.httpUriRequestConverter = httpUriRequestConverter;
46 }
47
48 public void setProxyProvider(ProxyProvider proxyProvider) {
49 this.proxyProvider = proxyProvider;
50 }
51
52 private CloseableHttpClient getHttpClient(Site site) {
53 if (site == null) {
54 return httpClientGenerator.getClient(null);
55 }
56 String domain = site.getDomain();
57 CloseableHttpClient httpClient = httpClients.get(domain);
58 if (httpClient == null) {
59 synchronized (this) {
60 httpClient = httpClients.get(domain);
61 if (httpClient == null) {
62 httpClient = httpClientGenerator.getClient(site);
63 httpClients.put(domain, httpClient);
64 }
65 }
66 }
67 return httpClient;
68 }
69
70 @Override
71 public Page download(Request request, Task task) {
72 if (task == null || task.getSite() == null) {
73 throw new NullPointerException("task or site can not be null");
74 }
75 CloseableHttpResponse httpResponse = null;
76 CloseableHttpClient httpClient = getHttpClient(task.getSite());
77 Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
78 HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
79 Page page = Page.fail(request);
80 try {
81 httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
82 page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
83 onSuccess(page, task);
84 return page;
85 } catch (IOException e) {
86 onError(page, task, e);
87 return page;
88 } finally {
89 if (httpResponse != null) {
90
91 EntityUtils.consumeQuietly(httpResponse.getEntity());
92 }
93 if (proxyProvider != null && proxy != null) {
94 proxyProvider.returnProxy(proxy, page, task);
95 }
96 }
97 }
98
99 @Override
100 public void setThread(int thread) {
101 httpClientGenerator.setPoolSize(thread);
102 }
103
104 protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
105 HttpEntity entity = httpResponse.getEntity();
106 byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];
107 String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null;
108 Page page = new Page();
109 page.setBytes(bytes);
110 if (!request.isBinaryContent()) {
111 if (charset == null) {
112 charset = getHtmlCharset(contentType, bytes, task);
113 }
114 page.setCharset(charset);
115 page.setRawText(new String(bytes, charset));
116 }
117 page.setUrl(new PlainText(request.getUrl()));
118 page.setRequest(request);
119 page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
120 page.setDownloadSuccess(true);
121 if (responseHeader) {
122 page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
123 }
124 return page;
125 }
126
127 private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
128 String charset = CharsetUtils.detectCharset(contentType, contentBytes);
129 if (charset == null) {
130 charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
131 }
132 return charset;
133 }
134 }