首页 > 技术文章 > python爬虫2——下载文件(中华网图片库下载)

everSeeker 2015-12-03 00:08 原文

# -*- coding: utf-8 -*-
import requests
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# Gallery page to scrape and where/how the images are stored.
PAGE_URL = 'http://photostock.china.com.cn/Web_CHN/SpecialTopicPhoto.aspx?Id=296'
OUTPUT_DIR = 'lovelyAnimals'
FIRST_PIC_NUMBER = 100   # the first saved file is "100.jpg"
REQUEST_TIMEOUT = 30     # seconds; without a timeout requests may block forever


def extract_image_urls(page_html, page_url):
    """Return the absolute URL of every ``<img alt=... src="...">`` in a page.

    Args:
        page_html: HTML text of the page.
        page_url: URL the page was fetched from; relative ``src`` values
            (for example ``../Upload/a.jpg``) are resolved against it.

    Returns:
        A list of absolute image URLs in document order (empty when the page
        contains no matching tag).
    """
    # urljoin lives in different modules on Python 2 and Python 3.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    # Capture the whole src attribute and let urljoin resolve it, instead of
    # chopping two characters off the front and gluing the host name on,
    # which only worked for "../"-prefixed paths.
    sources = re.findall(r'<img alt=.*?src="(.*?)".*?/>', page_html, re.S)
    return [urljoin(page_url, src) for src in sources]


def download_images(image_urls, output_dir, first_number=FIRST_PIC_NUMBER,
                    timeout=REQUEST_TIMEOUT):
    """Download each URL to ``<output_dir>/<number>.jpg``.

    Files are numbered consecutively from ``first_number`` in the order of
    ``image_urls``; a failed download leaves a gap in the numbering so the
    remaining files keep their position-based names.

    Args:
        image_urls: iterable of absolute image URLs.
        output_dir: directory to write into; created when missing.
        first_number: number used for the first file name.
        timeout: per-request timeout in seconds.

    Returns:
        The number of images written to disk.
    """
    import os
    import sys

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    saved = 0
    for number, image_url in enumerate(image_urls, first_number):
        try:
            response = requests.get(image_url, timeout=timeout)
            # Do not store an HTML error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as exc:
            # One broken image should not abort the whole batch.
            sys.stderr.write('skipping %s: %s\n' % (image_url, exc))
            continue
        target = os.path.join(output_dir, str(number) + '.jpg')
        with open(target, 'wb') as out_file:
            out_file.write(response.content)
        saved += 1
    return saved


if __name__ == '__main__':
    page = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    download_images(extract_image_urls(page.text, PAGE_URL), OUTPUT_DIR)

'''
下载文件的3种方法
(1): 使用urllib.urlretrieve方法,可在callbackfunc函数中显示下载进度
import urllib
def callbackfunc(blocknum, blocksize, totalsize):
    # 回调函数
    # @blocknum:
    #     已经下载的数据块

    # @blocksize:
    #     数据块的大小

    # @totalsize:
    #     远程文件的大小
    percent = 100.0 * blocknum * blocksize / totalsize
    if percent > 100:
        percent = 100
    print "%.2f%%"% percent
url = 'http://www.sina.com.cn'
local = 'lovelyAnimals/sina.html'
urllib.urlretrieve(url, local, callbackfunc)

(2):使用urllib2.urlopen
import urllib2
url = 'http://www.sina.com.cn'
f = urllib2.urlopen(url)
data = f.read()
with open("lovelyAnimals/sina.html", "wb") as code:
    code.write(data)

(3):使用requests模块
import requests
url = 'http://www.sina.com.cn'
html = requests.get(url)
with open("lovelyAnimals/sina.html", "wb") as code:
    code.write(html.content)
'''

 

推荐阅读