HttpClientDownloader.java 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. package com.nosum.common.util;
  2. import org.apache.commons.io.IOUtils;
  3. import org.apache.http.HttpResponse;
  4. import org.apache.http.client.methods.CloseableHttpResponse;
  5. import org.apache.http.impl.client.CloseableHttpClient;
  6. import org.apache.http.util.EntityUtils;
  7. import org.slf4j.Logger;
  8. import org.slf4j.LoggerFactory;
  9. import us.codecraft.webmagic.Page;
  10. import us.codecraft.webmagic.Request;
  11. import us.codecraft.webmagic.Site;
  12. import us.codecraft.webmagic.Task;
  13. import us.codecraft.webmagic.downloader.AbstractDownloader;
  14. import us.codecraft.webmagic.downloader.HttpClientRequestContext;
  15. import us.codecraft.webmagic.downloader.HttpUriRequestConverter;
  16. import us.codecraft.webmagic.proxy.Proxy;
  17. import us.codecraft.webmagic.proxy.ProxyProvider;
  18. import us.codecraft.webmagic.selector.PlainText;
  19. import us.codecraft.webmagic.utils.CharsetUtils;
  20. import us.codecraft.webmagic.utils.HttpClientUtils;
  21. import java.io.IOException;
  22. import java.nio.charset.Charset;
  23. import java.util.HashMap;
  24. import java.util.Map;
  25. /**
  26. * @author: sumbytes
  27. * @date: 2019/8/3 14:57
  28. */
  29. public class HttpClientDownloader extends AbstractDownloader {
  30. private final Map<String, CloseableHttpClient> httpClients = new HashMap<>();
  31. private Logger logger = LoggerFactory.getLogger(getClass());
  32. private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
  33. private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
  34. private ProxyProvider proxyProvider;
  35. private boolean responseHeader = true;
  36. public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
  37. this.httpUriRequestConverter = httpUriRequestConverter;
  38. }
  39. public void setProxyProvider(ProxyProvider proxyProvider) {
  40. this.proxyProvider = proxyProvider;
  41. }
  42. private CloseableHttpClient getHttpClient(Site site) {
  43. if (site == null) {
  44. return httpClientGenerator.getClient(null);
  45. }
  46. String domain = site.getDomain();
  47. CloseableHttpClient httpClient = httpClients.get(domain);
  48. if (httpClient == null) {
  49. synchronized (this) {
  50. httpClient = httpClients.get(domain);
  51. if (httpClient == null) {
  52. httpClient = httpClientGenerator.getClient(site);
  53. httpClients.put(domain, httpClient);
  54. }
  55. }
  56. }
  57. return httpClient;
  58. }
  59. @Override
  60. public Page download(Request request, Task task) {
  61. if (task == null || task.getSite() == null) {
  62. throw new NullPointerException("task or site can not be null");
  63. }
  64. CloseableHttpResponse httpResponse = null;
  65. CloseableHttpClient httpClient = getHttpClient(task.getSite());
  66. Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
  67. HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
  68. Page page = Page.fail();
  69. try {
  70. httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
  71. page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
  72. onSuccess(request);
  73. logger.info("downloading page success {}", request.getUrl());
  74. return page;
  75. } catch (IOException e) {
  76. logger.warn("download page {} error", request.getUrl(), e);
  77. onError(request);
  78. return page;
  79. } finally {
  80. if (httpResponse != null) {
  81. //ensure the connection is released back to pool
  82. EntityUtils.consumeQuietly(httpResponse.getEntity());
  83. }
  84. if (proxyProvider != null && proxy != null) {
  85. proxyProvider.returnProxy(proxy, page, task);
  86. }
  87. }
  88. }
  89. @Override
  90. public void setThread(int thread) {
  91. httpClientGenerator.setPoolSize(thread);
  92. }
  93. protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
  94. byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
  95. String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
  96. Page page = new Page();
  97. page.setBytes(bytes);
  98. if (!request.isBinaryContent()) {
  99. if (charset == null) {
  100. charset = getHtmlCharset(contentType, bytes);
  101. }
  102. page.setCharset(charset);
  103. page.setRawText(new String(bytes, charset));
  104. }
  105. page.setUrl(new PlainText(request.getUrl()));
  106. page.setRequest(request);
  107. page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
  108. page.setDownloadSuccess(true);
  109. if (responseHeader) {
  110. page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
  111. }
  112. return page;
  113. }
  114. private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
  115. String charset = CharsetUtils.detectCharset(contentType, contentBytes);
  116. if (charset == null) {
  117. charset = Charset.defaultCharset().name();
  118. logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
  119. }
  120. return charset;
  121. }
  122. }