该示例使用 HttpClient 实现了 HTTP 协议的 GET 下载方法:下载页面后,根据响应头或页面 meta 标签中声明的字符集,将网页内容重新编码为统一的字符集,以避免中文乱码。
文件名:HttpCrawler.java
package com.mang.video.crawler;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpConnectionParams;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
 * Downloads a web page with an HTTP GET request (Apache Commons HttpClient 3.x)
 * and re-encodes the body into a single target charset so that pages declared
 * in other encodings (e.g. GBK) do not end up garbled.
 *
 * <p>The outcome of the most recent download is kept in mutable instance fields
 * (body, detected charset, error code/message) without any synchronization, so
 * an instance should not be shared between threads.
 */
public class HttpCrawler {
    /** Status code of a successful download. */
    private static final int HTTP_OK = 200;
    /** Pseudo status code stored in {@code errorCode} when the request fails with an {@link IOException}. */
    private static final int IO_ERROR_CODE = 1004;
    /** How many times a download is retried after an I/O failure. */
    private static final int MAX_IO_RETRIES = 2;
    /** Pause between two attempts, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 2000L;

    /** Extracts the charset token of a (lower-cased) Content-Type header value. */
    private static final Pattern HEADER_CHARSET =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)");
    /**
     * Extracts the charset token of the first (lower-cased) meta tag that declares one; covers both
     * {@code <meta http-equiv=... content="text/html; charset=x">} and {@code <meta charset="x">}.
     */
    private static final Pattern META_CHARSET =
            Pattern.compile("<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([^\\s\"'/>;]+)");
    /** Single-byte charset used only to scan the raw bytes for the ASCII meta tag. */
    private static final Charset SNIFF_CHARSET = Charset.forName("ISO-8859-1");

    private String userAgent;
    private int connTimeOut = 5000;
    private int socketTimeOut = 5000;
    /** Body of the last successfully downloaded page, encoded in {@link #charset}; null on failure. */
    private byte[] html;
    /** Charset name declared by the last downloaded page, or null if none was found. */
    private String webCharset;
    /** Charset used for the request and as the target encoding of the stored body. */
    private String charset = "UTF-8";
    /** Raw HTTP status of the last response. */
    private int httpStatus;
    /** 200 on success, the HTTP status on a non-200 response, 1004 on an I/O failure. */
    private int errorCode;
    /** Human-readable description of the last failure, or null. */
    private String errorMessage;

    public int getConnTimeOut() {
        return connTimeOut;
    }

    public void setConnTimeOut(int connTimeOut) {
        this.connTimeOut = connTimeOut;
    }

    public int getSocketTimeOut() {
        return socketTimeOut;
    }

    public void setSocketTimeOut(int socketTimeOut) {
        this.socketTimeOut = socketTimeOut;
    }

    /** Returns a copy of the last downloaded body, or null if the last download failed. */
    public byte[] getHtml() {
        return html == null ? null : html.clone();
    }

    /** Returns the charset name declared by the last downloaded page, or null if none was found. */
    public String getWebCharset() {
        return webCharset;
    }

    /** Returns the raw HTTP status of the last response. */
    public int getHttpStatus() {
        return httpStatus;
    }

    /** Returns 200 after a successful download, otherwise the failing status (1004 for I/O errors). */
    public int getErrorCode() {
        return errorCode;
    }

    /** Returns the description of the last failure, or null if the last download succeeded. */
    public String getErrorMessage() {
        return errorMessage;
    }

    private void setHttpStatus(int httpStatus) {
        this.httpStatus = httpStatus;
    }

    /** Installs the browser-like User-Agent string sent with every request. */
    public void init() {
        userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36";
    }

    /**
     * Downloads {@code baseUrl}, retrying up to two more times (with a two second pause)
     * when the attempt fails with an I/O error.
     *
     * @param baseUrl URL of the page to fetch
     * @param rUrl    value for the Referer header; ignored when null or blank
     */
    public void downloadPage(String baseUrl, String rUrl) {
        // Fetch the page's HTML source from the network.
        downPageByGet(baseUrl, rUrl);
        // Only I/O failures are retried; HTTP error statuses are final.
        int retryTimes = 0;
        while (errorCode == IO_ERROR_CODE && retryTimes < MAX_IO_RETRIES) {
            try {
                Thread.sleep(RETRY_DELAY_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and stop retrying.
                Thread.currentThread().interrupt();
                break;
            }
            ++retryTimes;
            downPageByGet(baseUrl, rUrl);
        }
        if (errorCode != HTTP_OK) {
            System.out.println("Download page by get failed: " + errorCode);
        }
    }

    /**
     * Performs one GET attempt and records the outcome in {@code html}, {@code webCharset},
     * {@code errorCode} and {@code errorMessage}.
     */
    private void downPageByGet(String baseUrl, String rUrl) {
        // Results never carry over from a previous download.
        this.html = null;
        this.webCharset = null;
        this.errorMessage = null;

        GetMethod method = new GetMethod(baseUrl);
        if (rUrl != null && rUrl.trim().length() > 0) {
            method.setRequestHeader("Referer", rUrl);
        }
        HttpMethodParams methodParams = method.getParams();
        methodParams.setParameter(HttpMethodParams.HTTP_ELEMENT_CHARSET, charset);
        methodParams.setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, charset);
        methodParams.setParameter(HttpMethodParams.HTTP_URI_CHARSET, charset);
        methodParams.setParameter(HttpMethodParams.SO_TIMEOUT, Integer.valueOf(socketTimeOut));
        methodParams.setParameter(HttpMethodParams.USER_AGENT, userAgent);
        methodParams.setParameter("http.protocol.allow-circular-redirects", Boolean.TRUE);
        methodParams.setParameter("http.protocol.max-redirects", Integer.valueOf(10));
        method.setRequestHeader("Accept-Encoding", "gzip");
        method.setFollowRedirects(true);

        HttpClient client = new HttpClient();
        // The connect timeout is read from the connection (manager) parameters,
        // not from the method parameters, so it has to be configured here.
        HttpConnectionParams connectionParams = client.getHttpConnectionManager().getParams();
        connectionParams.setConnectionTimeout(connTimeOut);
        try {
            int result = client.executeMethod(method);
            this.setHttpStatus(result);
            if (HTTP_OK == result) {
                this.html = readBody(method);
                if (this.html != null) {
                    normalizeCharset(method.getResponseHeader("Content-Type"));
                }
                errorCode = HTTP_OK;
            } else {
                System.out.println(" HttpClient Result Status: " + result + " for url: " + baseUrl);
                errorCode = result;
                errorMessage = "Download page by httpClient with GET method, response status: " + result;
            }
        } catch (IOException e) {
            this.html = null;
            errorCode = IO_ERROR_CODE;
            errorMessage = "IO Exception for download page by httpClient with GET method; " + e.getMessage();
            System.out.println("http Exception: " + baseUrl);
            System.out.println(e.getMessage());
        } finally {
            method.releaseConnection();
            // This client is used for one request only; close the kept-alive
            // connection instead of leaving the socket open until garbage collection.
            client.getHttpConnectionManager().closeIdleConnections(0);
        }
    }

    /** Reads the whole response body, transparently inflating it when it is gzip-encoded. */
    private static byte[] readBody(GetMethod method) throws IOException {
        Header encoding = method.getResponseHeader("Content-Encoding");
        boolean gzipped = encoding != null
                && encoding.getValue() != null
                && encoding.getValue().toLowerCase(Locale.ROOT).indexOf("gzip") > -1;
        if (!gzipped) {
            return method.getResponseBody();
        }
        InputStream in = method.getResponseBodyAsStream();
        if (in == null) {
            return null;
        }
        try {
            // If the gzip header is invalid the constructor throws and the raw stream is closed below.
            in = new GZIPInputStream(in);
            ByteArrayOutputStream inflated = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer, 0, buffer.length)) != -1) {
                inflated.write(buffer, 0, len);
            }
            return inflated.toByteArray();
        } finally {
            in.close();
        }
    }

    /**
     * Detects the charset declared for the current body and, when it differs from the
     * target charset, re-encodes the body. An unknown or malformed charset name leaves
     * the body untouched instead of failing the download.
     */
    private void normalizeCharset(Header contentTypeHeader) {
        String contentType = contentTypeHeader == null ? null : contentTypeHeader.getValue();
        String declared = detectCharsetName(contentType, html);
        this.webCharset = declared;
        if (declared == null) {
            return;
        }
        Charset source = lookupCharset(declared);
        if (source == null) {
            System.out.println("Unsupported page charset, body left as downloaded: " + declared);
            return;
        }
        Charset target = lookupCharset(charset);
        // Charset equality also treats aliases such as "utf8" and "utf-8" as the same encoding.
        if (target != null && !source.equals(target)) {
            this.html = new String(html, source).getBytes(target);
        }
    }

    /**
     * Returns the charset name declared by the Content-Type header or, failing that, by a
     * meta tag in the body; null when neither declares one. The returned name is lower-cased.
     */
    private static String detectCharsetName(String contentType, byte[] body) {
        if (contentType != null) {
            Matcher headerMatcher = HEADER_CHARSET.matcher(contentType.toLowerCase(Locale.ROOT));
            if (headerMatcher.find()) {
                return headerMatcher.group(1);
            }
        }
        if (body == null || body.length == 0) {
            return null;
        }
        // The meta tag itself is ASCII, so a fixed single-byte decoding is enough to
        // locate it regardless of the platform default charset.
        String markup = new String(body, SNIFF_CHARSET).toLowerCase(Locale.ROOT);
        Matcher metaMatcher = META_CHARSET.matcher(markup);
        return metaMatcher.find() ? metaMatcher.group(1) : null;
    }

    /** Resolves a charset name, returning null when it is null, malformed or unsupported. */
    private static Charset lookupCharset(String name) {
        if (name == null) {
            return null;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            return null;
        }
    }
}

