

从零开始用Python爬虫采集网络数据（手把手实战教程）

Python数智工坊

2025-11-11

导读：互联网上有海量数据，但如果一条条手工复制粘贴，效率太低。Python爬虫可以自动采集数据，大大提高效率。

互联网上有海量数据，但如果一条条手工复制粘贴，效率太低。Python爬虫可以自动采集数据，大大提高效率。爬虫不是"黑科技"，也不是"非法工具"，它只是一个数据采集工具。使用时应遵守网站的robots.txt规则和服务条款，控制访问频率，不采集个人隐私或受版权保护的内容；是否合法最终取决于采集的数据及其用途，而不仅仅取决于是否遵守robots.txt。本文将从最基础的爬虫原理开始，手把手教你写出实用的爬虫。

爬虫的核心原理

爬虫就三个步骤：

1. 发送请求：向网站发送HTTP请求
2. 获取内容：接收网站返回的HTML内容
3. 解析数据：从HTML中提取需要的数据

# Three-step crawler demo: send the request, get the content, parse the data.
import requests
from bs4 import BeautifulSoup

# Step 1: send the request.
# Without an explicit timeout, requests can wait forever on a stalled server.
url = 'http://example.com'
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
response.raise_for_status()

# Step 2: get the content (the response body decoded to text).
html_content = response.text

# Step 3: parse the data with the stdlib-backed HTML parser.
soup = BeautifulSoup(html_content, 'html.parser')

实战1：爬取豆瓣电影评分

import time

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the Douban Top250 list and save it as an Excel file.
url = 'https://movie.douban.com/top250'

# NOTE(review): the list is assumed to be paginated at 25 movies per page via
# the `start` query parameter (0, 25, ..., 225) -- confirm against the live site.
PAGE_SIZE = 25
TOTAL_MOVIES = 250
REQUEST_DELAY_SECONDS = 2

# Request headers (present a browser User-Agent)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# One dict per movie, in list order
movies_data = []

for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
    response = requests.get(
        url, headers=headers, params={'start': start}, timeout=10
    )
    # Stop on an HTTP error instead of silently saving an incomplete list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for item in soup.find_all('div', class_='item'):
        title_tag = item.find('span', class_='title')
        rating_tag = item.find('span', class_='rating_num')
        info_tag = item.find('p', class_='')

        # Skip entries whose markup does not match instead of crashing with
        # AttributeError on None.
        if title_tag is None or rating_tag is None:
            continue

        title = title_tag.text
        rating = rating_tag.text
        # The info paragraph holds free text (year, country, ...)
        info = info_tag.text.strip() if info_tag is not None else ''

        movies_data.append({
            '电影名': title,
            '评分': rating,
            '信息': info
        })

        print(f"已爬取：{title}  评分：{rating}")

    # Pause between pages to avoid hammering the server; no wait after the last page.
    if start + PAGE_SIZE < TOTAL_MOVIES:
        time.sleep(REQUEST_DELAY_SECONDS)

# Save as Excel
df = pd.DataFrame(movies_data)
df.to_excel('豆瓣TOP250.xlsx', index=False)
print("✓ 数据已保存")

实战2：爬取天气信息

import csv
import json
import os

import requests

# Fetch current weather for several cities from WeatherAPI and save it as CSV.

# WeatherAPI expects an API key on every request (`key` parameter). Read it
# from the environment so it is not hard-coded in the script.
api_key = os.environ.get('WEATHERAPI_KEY')
if not api_key:
    raise SystemExit('请先设置环境变量 WEATHERAPI_KEY（weatherapi.com 的 API Key）')

# The endpoint is the same for every city, so it is defined once outside the loop.
url = 'https://api.weatherapi.com/v1/current.json'

cities = ['Beijing', 'Shanghai', 'Guangzhou']
weather_data = []

for city in cities:
    params = {
        'key': api_key,
        'q': city,
        'aqi': 'yes'
    }

    try:
        response = requests.get(url, params=params, timeout=5)
        # Turn 4xx/5xx (e.g. an invalid key) into an explicit error rather
        # than a confusing KeyError on the error payload.
        response.raise_for_status()
        current = response.json()['current']

        weather_info = {
            '城市': city,
            '温度': current['temp_c'],
            '天气': current['condition']['text'],
            '湿度': current['humidity'],
        }
    except (requests.RequestException, KeyError, ValueError) as e:
        # Network/HTTP failure, unexpected payload shape, or invalid JSON:
        # report it and continue with the next city.
        print(f"爬取{city}失败：{e}")
        continue

    weather_data.append(weather_info)
    print(f"{city}: {weather_info['天气']}, {weather_info['温度']}℃")

# Save the data
with open('weather.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['城市', '温度', '天气', '湿度'])
    writer.writeheader()
    writer.writerows(weather_data)

实战3：爬取新闻标题和链接

import requests
from bs4 import BeautifulSoup

# Print the first ten headline links found on the Sina news front page.
url = 'https://news.sina.com.cn/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

response = requests.get(url, headers=headers, timeout=10)
# Force UTF-8 decoding of the body before reading response.text.
response.encoding = 'utf-8'

soup = BeautifulSoup(response.text, 'html.parser')

# NOTE(review): assumes headline anchors carry class 'news-link' -- verify
# against the live page. `limit=10` stops the search after ten matches.
news_items = soup.find_all('a', class_='news-link', limit=10)

print("最新新闻：")
for rank, anchor in enumerate(news_items, start=1):
    headline = anchor.text.strip()
    href = anchor.get('href')
    print(f"{rank}. {headline}")
    print(f"   链接：{href}\n")

爬虫的最佳实践和注意事项

1. 检查robots.txt

from urllib.robotparser import RobotFileParser

import requests

# Download robots.txt and check whether a given URL may be crawled.
url = 'https://example.com/robots.txt'
# Explicit timeout so a stalled server cannot hang the script.
response = requests.get(url, timeout=5)
print(response.text)

# Parse the downloaded rules with the standard-library parser instead of
# reading them by eye.
robots = RobotFileParser()
robots.parse(response.text.splitlines())

# '*' asks about the rules that apply to any crawler; pass your own
# User-Agent string to check agent-specific rules.
target = 'https://example.com/some/page'
allowed = robots.can_fetch('*', target)
print(f"是否允许爬取 {target}：{allowed}")

# Adjust the crawler according to the robots.txt rules (skip disallowed URLs).

2. 设置请求延迟

import time
import requests

# Placeholder targets -- replace with real URLs before running.
urls = ['url1', 'url2', 'url3']

# Seconds to pause after each request so the site is not hit too frequently.
delay_seconds = 2

for target in urls:
    response = requests.get(target)
    # Process the data...

    time.sleep(delay_seconds)

3. 处理异常

import requests

# Demonstrates handling the common failure modes of a single GET request.
url = 'https://example.com'

try:
    # Give up after 5 seconds.
    response = requests.get(url, timeout=5)
    # Raises HTTPError when the response status is 4xx/5xx.
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("请求超时")
except requests.exceptions.ConnectionError:
    print("连接错误")
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP错误：{http_err}")

4. 使用Session（自动保存Cookie）处理需要登录的网站

import requests

# Log in with a Session so cookies set by the login response are sent
# automatically on later requests.
url = 'https://example.com/login'

# Login form fields. Placeholders only -- do not commit real credentials;
# load them from the environment or a secrets store instead.
login_data = {
    'username': 'your_username',
    'password': 'your_password'
}

# The context manager closes the session (and its pooled connections) on exit.
with requests.Session() as session:
    login_response = session.post(url, data=login_data, timeout=10)
    # Stop here if the login request itself failed (4xx/5xx).
    login_response.raise_for_status()

    # Access the protected page after logging in
    response = session.get('https://example.com/protected', timeout=10)
    response.raise_for_status()
    print(response.text)