1 package us.codecraft.webmagic.downloader;
2
3 import java.io.IOException;
4 import java.nio.charset.Charset;
5 import java.util.HashMap;
6 import java.util.Map;
7 import java.util.Optional;
8
9 import org.apache.commons.io.IOUtils;
10 import org.apache.http.HttpResponse;
11 import org.apache.http.client.methods.CloseableHttpResponse;
12 import org.apache.http.impl.client.CloseableHttpClient;
13 import org.apache.http.util.EntityUtils;
14 import org.slf4j.Logger;
15 import org.slf4j.LoggerFactory;
16
17 import us.codecraft.webmagic.Page;
18 import us.codecraft.webmagic.Request;
19 import us.codecraft.webmagic.Site;
20 import us.codecraft.webmagic.Task;
21 import us.codecraft.webmagic.proxy.Proxy;
22 import us.codecraft.webmagic.proxy.ProxyProvider;
23 import us.codecraft.webmagic.selector.PlainText;
24 import us.codecraft.webmagic.utils.CharsetUtils;
25 import us.codecraft.webmagic.utils.HttpClientUtils;
26
27
28
29
30
31
32
33 public class HttpClientDownloader extends AbstractDownloader {
34
35 private Logger logger = LoggerFactory.getLogger(getClass());
36
37 private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
38
39 private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
40
41 private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
42
43 private ProxyProvider proxyProvider;
44
45 private boolean responseHeader = true;
46
47 public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
48 this.httpUriRequestConverter = httpUriRequestConverter;
49 }
50
51 public void setProxyProvider(ProxyProvider proxyProvider) {
52 this.proxyProvider = proxyProvider;
53 }
54
55 private CloseableHttpClient getHttpClient(Site site) {
56 if (site == null) {
57 return httpClientGenerator.getClient(null);
58 }
59 String domain = site.getDomain();
60 CloseableHttpClient httpClient = httpClients.get(domain);
61 if (httpClient == null) {
62 synchronized (this) {
63 httpClient = httpClients.get(domain);
64 if (httpClient == null) {
65 httpClient = httpClientGenerator.getClient(site);
66 httpClients.put(domain, httpClient);
67 }
68 }
69 }
70 return httpClient;
71 }
72
73 @Override
74 public Page download(Request request, Task task) {
75 if (task == null || task.getSite() == null) {
76 throw new NullPointerException("task or site can not be null");
77 }
78 CloseableHttpResponse httpResponse = null;
79 CloseableHttpClient httpClient = getHttpClient(task.getSite());
80 Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
81 HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
82 Page page = Page.fail(request);
83 try {
84 httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
85 page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
86
87 onSuccess(page, task);
88 logger.info("downloading page success {}", request.getUrl());
89
90 return page;
91 } catch (IOException e) {
92
93 onError(page, task, e);
94 logger.info("download page {} error", request.getUrl(), e);
95
96 return page;
97 } finally {
98 if (httpResponse != null) {
99
100 EntityUtils.consumeQuietly(httpResponse.getEntity());
101 }
102 if (proxyProvider != null && proxy != null) {
103 proxyProvider.returnProxy(proxy, page, task);
104 }
105 }
106 }
107
108 @Override
109 public void setThread(int thread) {
110 httpClientGenerator.setPoolSize(thread);
111 }
112
113 protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
114 byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
115 String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
116 Page page = new Page();
117 page.setBytes(bytes);
118 if (!request.isBinaryContent()) {
119 if (charset == null) {
120 charset = getHtmlCharset(contentType, bytes, task);
121 }
122 page.setCharset(charset);
123 page.setRawText(new String(bytes, charset));
124 }
125 page.setUrl(new PlainText(request.getUrl()));
126 page.setRequest(request);
127 page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
128 page.setDownloadSuccess(true);
129 if (responseHeader) {
130 page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
131 }
132 return page;
133 }
134
135 private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
136 String charset = CharsetUtils.detectCharset(contentType, contentBytes);
137 if (charset == null) {
138 charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
139 logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
140 }
141 return charset;
142 }
143 }