123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- package com.nosum.common.util;
- import org.apache.commons.io.IOUtils;
- import org.apache.http.HttpResponse;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.util.EntityUtils;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import us.codecraft.webmagic.Page;
- import us.codecraft.webmagic.Request;
- import us.codecraft.webmagic.Site;
- import us.codecraft.webmagic.Task;
- import us.codecraft.webmagic.downloader.AbstractDownloader;
- import us.codecraft.webmagic.downloader.HttpClientRequestContext;
- import us.codecraft.webmagic.downloader.HttpUriRequestConverter;
- import us.codecraft.webmagic.proxy.Proxy;
- import us.codecraft.webmagic.proxy.ProxyProvider;
- import us.codecraft.webmagic.selector.PlainText;
- import us.codecraft.webmagic.utils.CharsetUtils;
- import us.codecraft.webmagic.utils.HttpClientUtils;
- import java.io.IOException;
- import java.nio.charset.Charset;
- import java.util.HashMap;
- import java.util.Map;
- /**
- * @author: sumbytes
- * @date: 2019/8/3 14:57
- */
- public class HttpClientDownloader extends AbstractDownloader {
- private final Map<String, CloseableHttpClient> httpClients = new HashMap<>();
- private Logger logger = LoggerFactory.getLogger(getClass());
- private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
- private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
- private ProxyProvider proxyProvider;
- private boolean responseHeader = true;
- public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
- this.httpUriRequestConverter = httpUriRequestConverter;
- }
- public void setProxyProvider(ProxyProvider proxyProvider) {
- this.proxyProvider = proxyProvider;
- }
- private CloseableHttpClient getHttpClient(Site site) {
- if (site == null) {
- return httpClientGenerator.getClient(null);
- }
- String domain = site.getDomain();
- CloseableHttpClient httpClient = httpClients.get(domain);
- if (httpClient == null) {
- synchronized (this) {
- httpClient = httpClients.get(domain);
- if (httpClient == null) {
- httpClient = httpClientGenerator.getClient(site);
- httpClients.put(domain, httpClient);
- }
- }
- }
- return httpClient;
- }
- @Override
- public Page download(Request request, Task task) {
- if (task == null || task.getSite() == null) {
- throw new NullPointerException("task or site can not be null");
- }
- CloseableHttpResponse httpResponse = null;
- CloseableHttpClient httpClient = getHttpClient(task.getSite());
- Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
- HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
- Page page = Page.fail();
- try {
- httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
- page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
- onSuccess(request);
- logger.info("downloading page success {}", request.getUrl());
- return page;
- } catch (IOException e) {
- logger.warn("download page {} error", request.getUrl(), e);
- onError(request);
- return page;
- } finally {
- if (httpResponse != null) {
- //ensure the connection is released back to pool
- EntityUtils.consumeQuietly(httpResponse.getEntity());
- }
- if (proxyProvider != null && proxy != null) {
- proxyProvider.returnProxy(proxy, page, task);
- }
- }
- }
- @Override
- public void setThread(int thread) {
- httpClientGenerator.setPoolSize(thread);
- }
- protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
- byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
- String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
- Page page = new Page();
- page.setBytes(bytes);
- if (!request.isBinaryContent()) {
- if (charset == null) {
- charset = getHtmlCharset(contentType, bytes);
- }
- page.setCharset(charset);
- page.setRawText(new String(bytes, charset));
- }
- page.setUrl(new PlainText(request.getUrl()));
- page.setRequest(request);
- page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
- page.setDownloadSuccess(true);
- if (responseHeader) {
- page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
- }
- return page;
- }
- private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
- String charset = CharsetUtils.detectCharset(contentType, contentBytes);
- if (charset == null) {
- charset = Charset.defaultCharset().name();
- logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
- }
- return charset;
- }
- }
|