

网络爬虫实例系列 —— HttpClient 网络爬虫

曼昂网络爬虫

2015-09-11

导读：该示例使用HttpClient实现了HTTP协议的GET下载方法，下载页面后，根据页面返回的信息，对网页内容重新进行编码转换，避免出现中文乱码。

该示例使用HttpClient实现了HTTP协议的GET下载方法。页面下载完成后，程序会根据响应头和页面内容中声明的字符集，对网页内容重新进行编码转换，避免出现中文乱码。

文件名：HttpCrawler.java

package com.mang.video.crawler;

import java.io.ByteArrayOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.methods.GetMethod;

import org.apache.commons.httpclient.params.HttpConnectionParams;

import org.apache.commons.httpclient.params.HttpMethodParams;

public class HttpCrawler {

// NOTE(review): the methods of this class also use errorCode, errorMessage, charset and
// setHttpStatus(int), none of which are declared in the visible source — confirm where
// they are defined.

// User-Agent value sent with every GET request; assigned by init() and null until then.
private String userAgent;

// Connection timeout passed to HttpClient as CONNECTION_TIMEOUT (presumably milliseconds — TODO confirm).
private int connTimeOut = 5000;

// Socket read timeout passed to HttpClient as SO_TIMEOUT (presumably milliseconds — TODO confirm).
private int socketTimeOut = 5000;

// Body of the most recently downloaded page; reset to null at the start of each download.
private byte[] html;

// Charset of the downloaded page, taken from the Content-Type header or a <meta> tag.
// Detection only runs while this field is null.
private String webCharset;

/**
 * Returns the connection timeout that is handed to HttpClient for each download.
 *
 * @return the currently configured connection timeout
 */
public int getConnTimeOut() {
    return this.connTimeOut;
}

/**
 * Replaces the connection timeout used by subsequent downloads.
 *
 * @param connTimeOut the new connection timeout; stored as given, without validation
 */
public void setConnTimeOut(int connTimeOut) {
    this.connTimeOut = connTimeOut;
}

/**
 * Returns the socket timeout that is handed to HttpClient for each download.
 *
 * @return the currently configured socket timeout
 */
public int getSocketTimeOut() {
    return this.socketTimeOut;
}

/**
 * Replaces the socket timeout used by subsequent downloads.
 *
 * @param socketTimeOut the new socket timeout; stored as given, without validation
 */
public void setSocketTimeOut(int socketTimeOut) {
    this.socketTimeOut = socketTimeOut;
}

/**
 * Prepares the crawler for use by selecting the User-Agent string sent with every request.
 *
 * The field has no initializer, so until this method runs the User-Agent is {@code null}.
 */
public void init() {
    // Identify as a desktop Chrome browser.
    this.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36";
}

public void downloadPage(String baseUrl, String rUrl) {

//从网络下载网页html源码

downPageByGet(baseUrl, rUrl);

//如果是IO异常，则重试两次

int retryTimes = 0;

while(errorCode == 1004 && retryTimes < 2) {

try {

Thread.sleep(2000);

} catch (InterruptedException e) {

e.printStackTrace();

}

++retryTimes;

downPageByGet(baseUrl, rUrl);

}

if(this.errorCode != 200) {

System.out.println("Download page by get failed: " + errorCode);

return;

}

private void downPageByGet(String baseUrl, String rUrl) {

this.html = null;

HttpClient client = null;

GetMethod method = new GetMethod(baseUrl);

if(rUrl != null && rUrl.trim().length() > 0) {

method.setRequestHeader("Referer", rUrl);

}

method.getParams().setParameter(HttpMethodParams.HTTP_ELEMENT_CHARSET, charset);

method.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET , charset);

method.getParams().setParameter(HttpMethodParams.HTTP_URI_CHARSET, charset);

method.getParams().setParameter(HttpConnectionParams.CONNECTION_TIMEOUT, connTimeOut);

method.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, socketTimeOut);

method.getParams().setParameter(HttpMethodParams.USER_AGENT, userAgent);

method.setRequestHeader("Accept-Encoding", "gzip");

method.setFollowRedirects(true);

method.getParams().setParameter("http.protocol.allow-circular-redirects", true);

method.getParams().setParameter("http.protocol.max-redirects", 10);

try {

client = new HttpClient();

int result = client.executeMethod(method);

this.setHttpStatus(result);

if (200 == result) {

Header contentEncodingHeader = method.getResponseHeader("Content-Encoding");

if(contentEncodingHeader != null && contentEncodingHeader.getValue().toLowerCase().indexOf("gzip") > -1) {

InputStream inStream = method.getResponseBodyAsStream();

GZIPInputStream gzipStream = new GZIPInputStream(inStream);

ByteArrayOutputStream baos = new ByteArrayOutputStream();

int bufSize = 1024;

byte [] htmlBuf = new byte[bufSize];

int len = 0;

while((len = gzipStream.read(htmlBuf, 0, bufSize)) != -1) {

baos.write(htmlBuf, 0, len);

}

this.html = baos.toByteArray();

gzipStream.close();

baos.close();

} else {

this.html = method.getResponseBody();

}

if(webCharset == null) {

boolean flag = false;

Header charsetTypeHeader = method.getResponseHeader("Content-Type");

if(charsetTypeHeader != null) {

String contentType = charsetTypeHeader.getValue().toLowerCase();

if(contentType.indexOf("charset=") > -1) {

Pattern pattern = Pattern.compile("charset=([^ ]+)");

Matcher matcher = pattern.matcher(contentType);

if(matcher.find()) {

webCharset = matcher.group(1);

flag = true;

}

if(!flag) {

String htmlWeb = new String(html).toLowerCase();

Pattern pattern = Pattern.compile("<meta [^>]*content=\"text/html; charset=([^\"]+)\"");

Matcher matcher = pattern.matcher(htmlWeb);

if(matcher.find()) {

webCharset = matcher.group(1);

} else {

pattern = Pattern.compile("<meta[^>]*charset=[\"]?([^>]+)[\"]?");

matcher = pattern.matcher(htmlWeb);

if(matcher.find()) {

webCharset = matcher.group(1);

}

if(webCharset != null) {

if(webCharset != "utf-8" && webCharset != "utf8") {

String tmp = new String(html, webCharset);

html = tmp.getBytes(charset);

}

errorCode = 200;

} else {

System.out.println(" HttpClient Result Status: " + result + " for url: " + baseUrl);

errorCode = result;

errorMessage = "Download page by httpClient with GET method, response status: " + result;

}

} catch (IOException e) {

errorCode = 1004;

errorMessage = "IO Exception for download page by httpClient with GET method; " + e.getMessage();

System.out.println("http Exception: " + baseUrl);

System.out.println(e.getMessage());

} finally {

if (method != null) {

method.releaseConnection();

}

【声明】内容源于网络

曼昂网络爬虫

我们是程序员开发者联盟，利用业余时间提供网络爬虫软件定制、微站、H5、网站等各类软件开发服务，有意者请留言！

内容 96

粉丝 0

曼昂网络爬虫我们是程序员开发者联盟，利用业余时间提供网络爬虫软件定制、微站、H5、网站等各类软件开发服务，有意者请留言！

总阅读44

粉丝0

内容96