1. The Most Common Image-Scraping Methods
1.1 urllib
'''
Signature: request.urlretrieve(url, filename=None, reporthook=None, data=None)
Docstring:
Retrieve a URL into a temporary location on disk.
Requires a URL argument. If a filename is passed, it is used as
the temporary file location. The reporthook argument should be
a callable that accepts a block number, a read size, and the
total file size of the URL target. The data argument should be
valid URL encoded data.
If a filename is passed and the URL points to a local resource,
the result is a copy from local file to new file.
Returns a tuple containing the path to the newly created
data file as well as the resulting HTTPMessage object.
File: ~/anaconda/lib/python3.6/urllib/request.py
Type: function
'''
from urllib import request

request.urlretrieve('https://img3.doubanio.com/view/photo/photo/public/p454345512.jpg', 'kids.jpg')

# Some servers reject Python's default User-Agent, so install an opener
# that sends a browser-like header before calling urlretrieve again.
opener = request.build_opener()
headers = ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0')
opener.addheaders = [headers]
request.install_opener(opener)
request.urlretrieve('http://www.qnong.com.cn/uploadfile/2016/0416/20160416101815887.jpg', './dog.jpg')
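The docstring above mentions the reporthook argument; one possible progress callback (show_progress is a name of my own choosing) might look like this:

from urllib import request

def show_progress(block_num, block_size, total_size):
    # block_num: number of blocks transferred so far
    # block_size: size of each block in bytes
    # total_size: total file size (-1 if the server did not report it)
    if total_size > 0:
        percent = min(block_num * block_size * 100 / total_size, 100)
        print('\r%.1f%%' % percent, end='')

request.urlretrieve('http://www.qnong.com.cn/uploadfile/2016/0416/20160416101815887.jpg',
                    './dog.jpg', reporthook=show_progress)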
1.2 requests
import requests
# stream=True defers downloading the body so it can be read in chunks
req = requests.get('http://www.qnong.com.cn/uploadfile/2016/0416/20160416101815887.jpg', stream=True)
with open('dog.jpg', 'wb') as wr:
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive chunks
            wr.write(chunk)
            wr.flush()
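An equivalent pattern, using nothing beyond requests and the standard library, is to copy the raw socket stream straight to disk; note that response.raw is only usable when stream=True was passed:

import shutil
import requests

resp = requests.get('http://www.qnong.com.cn/uploadfile/2016/0416/20160416101815887.jpg',
                    stream=True)
resp.raise_for_status()  # fail early on HTTP errors
with open('dog.jpg', 'wb') as f:
    # copy the undecoded stream directly to the file
    shutil.copyfileobj(resp.raw, f)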
2. Methods Supported by Scrapy
2.1 ImagesPipeline
# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 500
}

IMAGES_STORE = 'pictures'  # directory where downloaded images are stored
IMAGES_MIN_HEIGHT = 400    # filter out images smaller than 600x400
IMAGES_MIN_WIDTH = 600
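ImagesPipeline can also generate thumbnails. As an optional addition (the size names small/big here are arbitrary), settings.py could be extended with:

# settings.py (optional): each entry produces scaled copies
# under thumbs/<name>/ inside IMAGES_STORE
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}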
# items.py
import scrapy

class PictureItem(scrapy.Item):
    image_urls = scrapy.Field()  # input: URLs of the images to download
    images = scrapy.Field()      # output: populated by ImagesPipeline with download results
# myspider.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import PictureItem

class PicSpider(CrawlSpider):
    name = 'pic'
    allowed_domains = ['qnong.com.cn']
    start_urls = ['http://www.qnong.com.cn/']

    rules = (
        # follow every link on the site and scrape images from each page
        Rule(LinkExtractor(allow=r'.*', restrict_xpaths='//a[@href]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for img_url in response.xpath('//img/@src').extract():
            item = PictureItem()
            item['image_urls'] = [response.urljoin(img_url)]
            yield item
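To run this, install Pillow first (ImagesPipeline depends on it for image processing) and start the spider with scrapy crawl pic. By default the pipeline stores each image under IMAGES_STORE using a SHA1 hash of the request URL; a minimal sketch of that naming rule:

import hashlib

url = 'http://www.qnong.com.cn/uploadfile/2016/0416/20160416101815887.jpg'
# the default file_path of ImagesPipeline is equivalent to:
print('full/%s.jpg' % hashlib.sha1(url.encode('utf-8')).hexdigest())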
2.2 Custom Pipeline
# settings.py
ITEM_PIPELINES = {
    'qnong.pipelines.MyImagesPipeline': 500,
}
# IMAGES_STORE from section 2.1 is still required here,
# since MyImagesPipeline subclasses ImagesPipeline.
# items.py
import scrapy

class PictureItem(scrapy.Item):
    image_urls = scrapy.Field()   # input: URLs of the images to download
    images = scrapy.Field()       # output: download results from the pipeline
    image_paths = scrapy.Field()  # output: set in item_completed() below
# myspider.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import PictureItem

class PicSpider(CrawlSpider):
    name = 'pic'
    allowed_domains = ['qnong.com.cn']
    start_urls = ['http://www.qnong.com.cn/']

    rules = (
        Rule(LinkExtractor(allow=r'.*', restrict_xpaths='//a[@href]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for img_url in response.xpath('//img/@src').extract():
            item = PictureItem()
            item['image_urls'] = [response.urljoin(img_url)]
            yield item
# pipelines.py
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import scrapy

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # turn each URL on the item into a download Request
        for img_url in item['image_urls']:
            yield scrapy.Request(img_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples, one per request
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_paths
        return item

    def file_path(self, request, response=None, info=None):
        # name files after the last URL path segment instead of
        # the default SHA1 hash of the URL
        image_guid = request.url.split('/')[-1]
        return 'full/%s' % image_guid
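A note on this file_path override: naming files after the URL's basename means two different URLs that happen to end in the same filename will overwrite each other, whereas the default SHA1-based naming avoids such collisions.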
2.3 FilesPipeline and ImagesPipeline Workflow
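Both pipelines follow the same documented workflow:
1. The spider yields an item whose file_urls (FilesPipeline) or image_urls (ImagesPipeline) field lists the URLs to fetch.
2. The item enters the pipeline chain; when it reaches the media pipeline, those URLs are scheduled for download through Scrapy's scheduler and downloader, and the item stays locked at that stage until every download finishes or fails.
3. Once the downloads complete, the results (original url, stored path, and checksum for each file) are written to the files or images field, and the item is released to continue through the pipeline chain.
On top of this, ImagesPipeline converts all images to JPEG/RGB, can generate thumbnails (IMAGES_THUMBS), and can discard images below IMAGES_MIN_WIDTH / IMAGES_MIN_HEIGHT.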

