Web Crawler Example Series: An HttpClient Web Crawler

曼昂网络爬虫
2015-09-11
This example uses HttpClient to download pages over HTTP with the GET method. After a page is downloaded, its content is re-encoded according to the charset information the page returns, which avoids garbled Chinese text.
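
The re-encoding step is what prevents the garbled text: the raw bytes are first decoded with the charset the page itself declares, then re-encoded with the charset the crawler works in (UTF-8). In isolation, the step looks like the minimal sketch below; CharsetNormalizer is only an illustrative helper, not part of the example, and GBK is used purely as an example of a non-UTF-8 page charset.

import java.io.UnsupportedEncodingException;

class CharsetNormalizer {
    // Decode raw bytes with the charset the page declared (e.g. "gbk", taken from the
    // Content-Type header or a <meta> tag), then re-encode them as UTF-8.
    static byte[] toUtf8(byte[] raw, String pageCharset) throws UnsupportedEncodingException {
        String decoded = new String(raw, pageCharset);
        return decoded.getBytes("utf-8");
    }
}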

File: HttpCrawler.java


package com.mang.video.crawler;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpConnectionParams;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class HttpCrawler {

    private String userAgent;
    private int connTimeOut = 5000;      // connection timeout in milliseconds
    private int socketTimeOut = 5000;    // socket read timeout in milliseconds
    private byte[] html;                 // downloaded page body, normalized to UTF-8
    private String webCharset;           // charset declared by the page, detected after download
    private String charset = "utf-8";    // working charset everything is converted to
    private int httpStatus;              // last HTTP status returned by the server
    private int errorCode;               // 200 on success, HTTP status or 1004 (IO error) otherwise
    private String errorMessage;

    public int getConnTimeOut() {
        return connTimeOut;
    }

    public void setConnTimeOut(int connTimeOut) {
        this.connTimeOut = connTimeOut;
    }

    public int getSocketTimeOut() {
        return socketTimeOut;
    }

    public void setSocketTimeOut(int socketTimeOut) {
        this.socketTimeOut = socketTimeOut;
    }

    public int getHttpStatus() {
        return httpStatus;
    }

    public void setHttpStatus(int httpStatus) {
        this.httpStatus = httpStatus;
    }

    // accessors so callers can read the downloaded body and the result of the last request
    public byte[] getHtml() {
        return html;
    }

    public int getErrorCode() {
        return errorCode;
    }

    public String getErrorMessage() {
        return errorMessage;
    }

    public void init() {
        userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36";
    }

    public void downloadPage(String baseUrl, String rUrl) {
        // download the page's HTML source
        downPageByGet(baseUrl, rUrl);
        // on an IO exception (errorCode 1004), wait two seconds and retry, at most twice
        int retryTimes = 0;
        while (errorCode == 1004 && retryTimes < 2) {
            try {
                Thread.sleep(2000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            ++retryTimes;
            downPageByGet(baseUrl, rUrl);
        }
        if (this.errorCode != 200) {
            System.out.println("Download page by get failed: " + errorCode);
        }
    }

    private void downPageByGet(String baseUrl, String rUrl) {
        this.html = null;
        HttpClient client = null;
        GetMethod method = new GetMethod(baseUrl);
        if (rUrl != null && rUrl.trim().length() > 0) {
            method.setRequestHeader("Referer", rUrl);
        }

        method.getParams().setParameter(HttpMethodParams.HTTP_ELEMENT_CHARSET, charset);
        method.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, charset);
        method.getParams().setParameter(HttpMethodParams.HTTP_URI_CHARSET, charset);
        method.getParams().setParameter(HttpConnectionParams.CONNECTION_TIMEOUT, connTimeOut);
        method.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, socketTimeOut);
        method.getParams().setParameter(HttpMethodParams.USER_AGENT, userAgent);
        method.setRequestHeader("Accept-Encoding", "gzip");
        method.setFollowRedirects(true);
        method.getParams().setParameter("http.protocol.allow-circular-redirects", true);
        method.getParams().setParameter("http.protocol.max-redirects", 10);

        try {
            client = new HttpClient();
            int result = client.executeMethod(method);
            this.setHttpStatus(result);
            if (200 == result) {
                // unzip the body if the server honoured "Accept-Encoding: gzip"
                Header contentEncodingHeader = method.getResponseHeader("Content-Encoding");
                if (contentEncodingHeader != null && contentEncodingHeader.getValue().toLowerCase().indexOf("gzip") > -1) {
                    InputStream inStream = method.getResponseBodyAsStream();
                    GZIPInputStream gzipStream = new GZIPInputStream(inStream);
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    int bufSize = 1024;
                    byte[] htmlBuf = new byte[bufSize];
                    int len = 0;
                    while ((len = gzipStream.read(htmlBuf, 0, bufSize)) != -1) {
                        baos.write(htmlBuf, 0, len);
                    }
                    this.html = baos.toByteArray();
                    gzipStream.close();
                    baos.close();
                } else {
                    this.html = method.getResponseBody();
                }

                // detect the page charset: first from the Content-Type response header,
                // then from <meta> tags inside the page itself
                if (webCharset == null) {
                    boolean flag = false;
                    Header charsetTypeHeader = method.getResponseHeader("Content-Type");
                    if (charsetTypeHeader != null) {
                        String contentType = charsetTypeHeader.getValue().toLowerCase();
                        if (contentType.indexOf("charset=") > -1) {
                            Pattern pattern = Pattern.compile("charset=([^ ]+)");
                            Matcher matcher = pattern.matcher(contentType);
                            if (matcher.find()) {
                                webCharset = matcher.group(1);
                                flag = true;
                            }
                        }
                    }
                    if (!flag) {
                        String htmlWeb = new String(html).toLowerCase();
                        Pattern pattern = Pattern.compile("<meta [^>]*content=\"text/html; charset=([^\"]+)\"");
                        Matcher matcher = pattern.matcher(htmlWeb);
                        if (matcher.find()) {
                            webCharset = matcher.group(1);
                        } else {
                            pattern = Pattern.compile("<meta[^>]*charset=[\"]?([^\"> ]+)");
                            matcher = pattern.matcher(htmlWeb);
                            if (matcher.find()) {
                                webCharset = matcher.group(1);
                            }
                        }
                    }
                    // if the page is not already UTF-8, decode it with its own charset and
                    // re-encode it as UTF-8 so later processing never sees garbled Chinese
                    if (webCharset != null) {
                        if (!"utf-8".equalsIgnoreCase(webCharset) && !"utf8".equalsIgnoreCase(webCharset)) {
                            String tmp = new String(html, webCharset);
                            html = tmp.getBytes(charset);
                        }
                    }
                }
                errorCode = 200;
            } else {
                System.out.println(" HttpClient Result Status: " + result + " for url: " + baseUrl);
                errorCode = result;
                errorMessage = "Download page by httpClient with GET method, response status: " + result;
            }
        } catch (IOException e) {
            errorCode = 1004;
            errorMessage = "IO Exception for download page by httpClient with GET method; " + e.getMessage();
            System.out.println("http Exception: " + baseUrl);
            System.out.println(e.getMessage());
        } finally {
            if (method != null) {
                method.releaseConnection();
            }
        }
    }

}
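
A minimal driver for trying the class out could look like the sketch below. The file name, class name, and URL are placeholders chosen only for illustration; the accessors it calls (getHtml(), getErrorCode()) are the ones defined on HttpCrawler above.

File: CrawlerDemo.java (illustrative only)

package com.mang.video.crawler;

public class CrawlerDemo {

    public static void main(String[] args) throws Exception {
        HttpCrawler crawler = new HttpCrawler();
        crawler.init();                  // sets the browser-like User-Agent
        crawler.setConnTimeOut(5000);
        crawler.setSocketTimeOut(5000);

        // second argument is the Referer header; pass null when none is needed
        crawler.downloadPage("http://www.example.com/", null);

        if (crawler.getErrorCode() == 200 && crawler.getHtml() != null) {
            // the body has already been normalized to UTF-8 by HttpCrawler
            System.out.println(new String(crawler.getHtml(), "utf-8"));
        }
    }
}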

