大数跨境
0
0

网络爬虫实例系列 —— C++ 网页下载

网络爬虫实例系列 —— C++ 网页下载 曼昂网络爬虫
2015-09-13
1
导读:该实例通过Socket模拟实现HTTP协议的GET请求方法,实现下载网页内容的功能。头文件 downloader.h

该实例通过Socket模拟实现HTTP协议的GET请求方法,实现下载网页内容的功能。

头文件 downloader.h

#ifndef _DOWNLOADER_H_

#define _DOWNLOADER_H_


#include <sys/types.h>

#include <sys/socket.h>

#include <netinet/in.h>

#include <arpa/inet.h>

#include <netdb.h>


#include <sys/time.h>

#include <unistd.h>

#include <errno.h>


#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <netdb.h>


#include <iostream>

#include <string>

#include <stdexcept>


namespace httpget

{

// Minimal HTTP GET downloader built directly on BSD sockets.
// All members are static; the class only groups the helpers.
class DOWNLOAD
{
public:
    // Downloader interface.
    // Fetches `uri` (expected form: http://host[/path]) from port 80 and
    // appends everything received from the server to `textbuf`; `textbuf`
    // is not cleared first.
    // Returns 1 on success, -1 on failure.
    static int DownLoader(std::string &textbuf, const std::string &uri);

private:
    // Separate host (domain) and path from uri.
    // `host` is left empty when the uri cannot be split.
    static void SepHostPath(const std::string &uri, std::string &host, std::string &path);

    // Resolve `host` to an IPv4 address stored in `ipaddr`.
    // Returns 1 on success, -1 on failure.
    static int DnsResolve(const std::string &host, struct in_addr &ipaddr);

    // Wait for readable data from the connected server, polling with
    // select() up to `loops` times with timeout `tv`.
    // Returns the last select() result: >0 readable, 0 timed out, <0 error.
    static int Readable(int sock_fd, struct timeval tv, int loops);

    // Bound the time spent connecting: wait up to `tv` for the socket to
    // become writable, then check SO_ERROR.
    // Returns 1 when the connection is established, -1 otherwise.
    static int TimeOut(int sock_fd, struct timeval tv);

    // Create a TCP/IPv4 socket and store its descriptor in `sock_fd`.
    // Returns 1 on success, -1 on failure.
    static int CreatSockId(int &sock_fd);

    // Resolve `host` and connect `sock_fd` to it on `port`.
    // Returns -1 on failure, a non-negative value on success.
    static int ConnectSer(int sock_fd, const std::string &host, int port);

    // Send the GET request for `path` on `host` to the web server.
    // Returns 1 on success, -1 on failure.
    static int SendMsg(int sock_fd, const std::string &host, const std::string &path);

    // Read data from the web server and append it to `textbuf`.
    // Returns 1 on success, -1 on failure.
    static int RecvMsg(int sock_fd, std::string &textbuf);
};

}


#endif


方法实现文件:downloader.cpp


#include "downloader.h"

#include <fcntl.h>


namespace httpget

{

// Split an HTTP URI into the host name and the request path.
//
// Accepted form: "http://host[/path][?query][#fragment]" (the scheme is
// matched case-insensitively).
//
// uri   the URI to split
// host  receives the host part; left empty when the URI does not start with
//       "http://" or has no host, which callers treat as "cannot download"
// path  receives the request target starting with '/'; "/" when the URI has
//       no path. A fragment is dropped because it must not be sent to the
//       server.
void DOWNLOAD::SepHostPath(const std::string &uri, std::string &host, std::string &path)
{
    static const char kScheme[] = "http://";
    const std::string::size_type scheme_len = sizeof(kScheme) - 1;

    // Work on locals so the outputs are only touched once parsing is done.
    std::string parsed_host;
    std::string parsed_path;

    // Verify the scheme instead of blindly skipping seven characters.
    bool scheme_ok = uri.size() >= scheme_len;
    for (std::string::size_type i = 0; scheme_ok && i < scheme_len; ++i)
    {
        char c = uri[i];
        if (c >= 'A' && c <= 'Z')
            c = static_cast<char>(c - 'A' + 'a');
        scheme_ok = (c == kScheme[i]);
    }

    if (scheme_ok)
    {
        // The host ends at the first path, query or fragment delimiter.
        const std::string::size_type host_end = uri.find_first_of("/?#", scheme_len);
        if (host_end == std::string::npos)
        {
            parsed_host = uri.substr(scheme_len);
            parsed_path = "/";
        }
        else
        {
            parsed_host = uri.substr(scheme_len, host_end - scheme_len);
            if (uri[host_end] == '/')
                parsed_path = uri.substr(host_end);
            else if (uri[host_end] == '?')
                parsed_path = "/" + uri.substr(host_end);
            else
                parsed_path = "/";

            const std::string::size_type fragment = parsed_path.find('#');
            if (fragment != std::string::npos)
                parsed_path.erase(fragment);
        }

        // A URI without a host is unusable; report it like any parse error.
        if (parsed_host.empty())
            parsed_path.clear();
    }

    host = parsed_host;
    path = parsed_path;
}

// Wait for a pending connect() on sock_fd to finish.
//
// sock_fd  socket with a connect in progress
// tv       maximum time to wait for the socket to become writable
//
// Returns 1 when the socket became writable and SO_ERROR reports no error,
// -1 on timeout, on select()/getsockopt() failure, or when the connect failed.
int DOWNLOAD::TimeOut(int sock_fd, struct timeval tv)
{
    // FD_SET on a descriptor outside [0, FD_SETSIZE) is undefined behaviour.
    if (sock_fd < 0 || sock_fd >= FD_SETSIZE)
        return -1;

    int ready = 0;
    do
    {
        // The set must be rebuilt before every select() call.
        fd_set wfdset;
        FD_ZERO(&wfdset);
        FD_SET(sock_fd, &wfdset);
        // A finished connect is signalled through the write set.
        ready = select(sock_fd + 1, NULL, &wfdset, NULL, &tv);
    } while (ready == -1 && errno == EINTR); // a signal is not a connect failure

    if (ready <= 0)
        return -1;

    // Writability alone does not mean success; the outcome is in SO_ERROR.
    int error = -1;
    socklen_t len = sizeof(error);
    if (getsockopt(sock_fd, SOL_SOCKET, SO_ERROR, &error, &len) == -1)
        return -1;

    return (error == 0) ? 1 : -1;
}

// Resolve a host name (or dotted-decimal string) to an IPv4 address.
//
// host    name to resolve
// ipaddr  receives the first IPv4 address found; untouched on failure
//
// Returns 1 on success, -1 when the name is empty or cannot be resolved.
//
// Uses getaddrinfo() instead of gethostbyname(): the latter is obsolete and
// returns a pointer to static storage, so it is not safe with threads.
int DOWNLOAD::DnsResolve(const std::string &host, struct in_addr &ipaddr)
{
    if (host.empty())
        return -1;

    struct addrinfo hints;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;       // the caller connects over IPv4 only
    hints.ai_socktype = SOCK_STREAM; // avoid one result per socket type

    struct addrinfo *results = NULL;
    if (getaddrinfo(host.c_str(), NULL, &hints, &results) != 0 || results == NULL)
        return -1;

    int status = -1;
    for (const struct addrinfo *ai = results; ai != NULL; ai = ai->ai_next)
    {
        if (ai->ai_family != AF_INET || ai->ai_addr == NULL ||
            ai->ai_addrlen < sizeof(struct sockaddr_in))
            continue;

        // Copy out rather than casting the generic sockaddr pointer.
        struct sockaddr_in addr;
        memcpy(&addr, ai->ai_addr, sizeof(addr));
        ipaddr = addr.sin_addr;
        status = 1;
        break;
    }

    freeaddrinfo(results);
    return status;
}

// Poll sock_fd until it has data to read.
//
// sock_fd  socket to watch
// tv       timeout of each individual select() call
// loops    maximum number of select() attempts
//
// Returns the result of the last select() call: > 0 when the socket is
// readable, 0 when every attempt timed out (or loops <= 0), < 0 on error.
int DOWNLOAD::Readable(int sock_fd, struct timeval tv, int loops)
{
    // FD_SET on a descriptor outside [0, FD_SETSIZE) is undefined behaviour.
    if (sock_fd < 0 || sock_fd >= FD_SETSIZE)
        return -1;

    int ready = 0;
    for (int iter = 0; iter < loops; ++iter)
    {
        // select() overwrites the set (it is empty after a timeout) and may
        // overwrite the timeout, so both are rebuilt for every attempt.
        fd_set rfdset;
        FD_ZERO(&rfdset);
        FD_SET(sock_fd, &rfdset);
        struct timeval wait = tv;

        ready = select(sock_fd + 1, &rfdset, NULL, NULL, &wait);
        if (ready > 0 || (ready < 0 && errno != EINTR))
            break;

        // Pause between attempts, but not after the final one.
        if (iter + 1 < loops)
            sleep(1);
    }
    return ready;
}


// Open a TCP/IPv4 stream socket.
//
// sock_fd  receives the descriptor returned by socket() (-1 on failure)
//
// Returns 1 when the socket was created, -1 otherwise.
int DOWNLOAD::CreatSockId(int &sock_fd)
{
    sock_fd = socket(AF_INET, SOCK_STREAM, 0);
    return (sock_fd == -1) ? -1 : 1;
}

// Resolve `host` and connect `sock_fd` to it, waiting at most 5 seconds.
//
// sock_fd  an unconnected TCP/IPv4 socket
// host     server name, resolved through DnsResolve()
// port     TCP port in the range 1..65535
//
// Returns -1 on failure (bad port, DNS failure, connect error or timeout).
// On success returns 0 when connect() completed immediately, or 1 when it
// completed within the timeout.
int DOWNLOAD::ConnectSer(int sock_fd, const std::string &host, int port)
{
    // htons() would silently truncate an out-of-range port.
    if (port <= 0 || port > 65535)
        return -1;

    struct in_addr ipaddr;
    ipaddr.s_addr = 0;
    if (DnsResolve(host, ipaddr) == -1 || ipaddr.s_addr == 0)
        return -1;

    struct sockaddr_in serv_addr;
    memset(&serv_addr, 0, sizeof(serv_addr));
    serv_addr.sin_family = AF_INET;
    serv_addr.sin_port = htons(static_cast<unsigned short>(port));
    serv_addr.sin_addr = ipaddr;

    // connect() only reports EINPROGRESS on a non-blocking socket, so switch
    // the socket to non-blocking mode for the duration of the connect.
    // If that fails, fall back to a plain blocking connect.
    const int old_flags = fcntl(sock_fd, F_GETFL, 0);
    const bool nonblocking =
        old_flags != -1 && fcntl(sock_fd, F_SETFL, old_flags | O_NONBLOCK) != -1;

    int result = connect(sock_fd, reinterpret_cast<struct sockaddr *>(&serv_addr),
                         sizeof(serv_addr));
    if (result == -1 && errno == EINPROGRESS)
    {
        struct timeval timeout;
        timeout.tv_sec = 5;
        timeout.tv_usec = 0;
        result = TimeOut(sock_fd, timeout);
    }

    // Later send/recv calls expect a blocking socket; restore the old mode.
    if (nonblocking && fcntl(sock_fd, F_SETFL, old_flags) == -1)
        return -1;

    return result;
}


// Build and send an HTTP/1.0 GET request.
//
// sock_fd  connected socket
// host     value of the Host header
// path     request target; "/" is used when empty
//
// Returns 1 once the whole request has been written, -1 on a send error or
// when memory for the request cannot be allocated.
int DOWNLOAD::SendMsg(int sock_fd, const std::string &host, const std::string &path)
{
    try
    {
        std::string request;
        request.reserve(host.length() + path.length() + 200);

        request += "GET ";
        request += path.empty() ? std::string("/") : path;
        request += " HTTP/1.0\r\n";
        // Exactly one CRLF after each header: a blank line here would end
        // the header section and the remaining headers would be ignored.
        request += "Host: ";
        request += host;
        request += "\r\n";
        request += "Accept: */*\r\n";
        request += "User-Agent: Mozila/4.0(compatible; MSIE 5.0; Windows NT; DigExt; DTS Agent;)\r\n";
        // The response is read until the server closes the connection, so
        // ask for a close instead of a keep-alive connection.
        request += "Connection: close\r\n";
        request += "\r\n";

        // Send the request exactly once; send() may accept only part of the
        // buffer, so loop until every byte is written.
        const char *cursor = request.data();
        std::string::size_type remaining = request.size();
        while (remaining > 0)
        {
            const ssize_t sent = send(sock_fd, cursor, remaining, 0);
            if (sent < 0)
            {
                if (errno == EINTR)
                    continue; // interrupted before any byte was sent
                return -1;
            }
            if (sent == 0)
                return -1; // no progress; avoid spinning forever

            cursor += sent;
            remaining -= static_cast<std::string::size_type>(sent);
        }
        return 1;
    }
    catch (const std::bad_alloc &)
    {
        return -1;
    }
}


// Read the server's response until the peer closes the connection.
//
// sock_fd  connected socket on which a request has been sent
// textbuf  every received byte is appended to it; existing content is kept
//
// Returns 1 when the connection was read to end-of-stream, -1 when no data
// became readable in time, when recv() fails, or when memory runs out.
// Bytes received before a failure remain in textbuf.
int DOWNLOAD::RecvMsg(int sock_fd, std::string &textbuf)
{
    struct timeval tv;
    tv.tv_sec = 1;
    tv.tv_usec = 0;
    if (Readable(sock_fd, tv, 10) < 1)
        return -1;

    // Stack buffer: nothing to free on any exit path.
    char recvbuf[10240];
    try
    {
        for (;;)
        {
            const ssize_t len = recv(sock_fd, recvbuf, sizeof(recvbuf), MSG_WAITALL);
            if (len > 0)
            {
                // Append by length: no terminator is written (a full buffer
                // leaves no room for one) and NUL bytes are preserved.
                textbuf.append(recvbuf, static_cast<std::string::size_type>(len));
            }
            else if (len == 0)
            {
                break; // orderly shutdown by the peer: response complete
            }
            else if (errno != EINTR)
            {
                return -1; // a real receive error, not end-of-stream
            }
        }
        return 1;
    }
    catch (const std::bad_alloc &)
    {
        return -1;
    }
}


// Download the resource named by `uri` over HTTP on port 80.
//
// textbuf  receives whatever RecvMsg() reads from the server
// uri      address to fetch
//
// Returns 1 when every stage (split, socket, connect, send, receive)
// succeeded, -1 as soon as one of them fails.
int DOWNLOAD::DownLoader(std::string &textbuf, const std::string &uri)
{
    std::string host;
    std::string path;
    SepHostPath(uri, host, path);
    if (host.empty())
        return -1;

    int sock_fd = 0;
    if (CreatSockId(sock_fd) == -1)
        return -1;

    // Stages run in order and stop at the first failure; the socket is
    // closed exactly once whatever the outcome.
    const bool ok = ConnectSer(sock_fd, host, 80) != -1
                 && SendMsg(sock_fd, host, path) != -1
                 && RecvMsg(sock_fd, textbuf) != -1;

    close(sock_fd);
    return ok ? 1 : -1;
}

}


测试文件:test.cpp

#include <iostream>

#include <string>

#include "downloader.h"


// Demo: download one page and print whatever the server returned.
// Exits with 0 on success and 1 when the download fails.
int main()
{
    std::string webContent;
    const std::string url("http://www.baidu.com");

    // DownLoader reports failure with -1; do not print a bogus page then.
    if (httpget::DOWNLOAD::DownLoader(webContent, url) == -1)
    {
        std::cerr << "failed to download " << url << std::endl;
        return 1;
    }

    std::cout << webContent << std::endl;

    return 0;
}


【声明】内容源于网络
0
0
曼昂网络爬虫
我们是程序员开发者联盟,利用业余时间提供网络爬虫软件定制、微站、H5、网站等各类软件开发服务,有意者请留言!
内容 96
粉丝 0
曼昂网络爬虫 我们是程序员开发者联盟,利用业余时间提供网络爬虫软件定制、微站、H5、网站等各类软件开发服务,有意者请留言!
总阅读44
粉丝0
内容96