# Source: http://www.cnblogs.com/Dadio/p/5513594.html
# Pixiv (P站) spotlight-scraping code from the blog post above — noted as promising, not yet reviewed.
"""Downloader for Pixiv "spotlight" feature pages.

Usage: python script.py <spotlight-number> <save-root>

Fetches http://spotlight.pics/zh/a/<number>, creates
<save-root>\Picture\<title>\, saves the article introduction to a text
file, then downloads the original-resolution image behind every
illustration thumbnail on the page.
"""
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
import os
import codecs
import sys

# Headers sent with every request.  Pixiv's image servers reject requests
# whose Referer does not point back at an artwork page, so getpic() fills
# 'Referer' in before each download.
headers = {
    'Accept': 'text/html',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Referer': "",
    'User-Agent': "此处为浏览器的user-agent",  # fill in your browser's User-Agent string
}
order = 1  # 1-based index of the artwork currently being processed


def _local_name(url):
    # Flatten the URL into a filename: strip slashes, keep the last 15
    # characters (original script's naming scheme), rooted at file_path.
    return file_path + url.replace('/', "")[-15:]


def _save(data, url, fmt):
    # Decode the HTTP payload with PIL and write it out in the given format.
    Image.open(BytesIO(data.content)).save(_local_name(url), fmt)


def _download_multi(url, ext, fmt):
    """Download every page of a multi-page illustration.

    Pixiv numbers pages p0, p1, ... in the original URL; keep bumping the
    page number until the server stops answering 200 (capped at 150 extra
    pages, as in the original script).

    url -- original-resolution URL of page p0
    ext -- file extension in the URL ("png" or "jpg")
    fmt -- PIL save format ('png' or 'jpeg')
    """
    print("********正在下载第1张")
    if not os.path.exists(_local_name(url)):
        data = requests.get(url, headers=headers, timeout=60)
        _save(data, url, fmt)
    print("********已下载第1张")
    for i in range(150):
        url = url.replace("p%d.%s" % (i, ext), "p%d.%s" % (i + 1, ext))
        os.system("cls")
        print("********正在下载第%d张..." % (i + 2))
        if not os.path.exists(_local_name(url)):
            data = requests.get(url, headers=headers, timeout=60)
            # Fixed: original compared str(data) against '<Response [200]>',
            # a fragile repr check; use the status code directly.
            if data.status_code != 200:
                break  # no such page -> we are past the last one
            _save(data, url, fmt)
        print("********已下载第%d张" % (i + 2))


def getpic(src, href, mode=""):
    """Download the original-size image(s) behind one thumbnail.

    src  -- thumbnail ("_master1200") image URL from the spotlight page
    href -- artwork page URL, used as the Referer for the image request
    mode -- "mul" for a multi-page illustration, "" for a single image

    Reads the module globals file_path, number_of_file and order.
    """
    os.system("cls")
    print("共有%d个文件需要下载" % number_of_file)
    if src[-3:] == "gif":
        # Animated previews are skipped ("saving static pictures as gif is
        # heresy", per the original author).
        return
    headers['Referer'] = href
    # Rewrite the thumbnail URL into the original-resolution URL: drop the
    # "_master1200" suffix and swap the sizing path segment (characters
    # 20..39 of the URL) for "img-original".
    url = src.replace("_master1200", "")
    url = url.replace(url[20:40], "img-original")
    print('正在下载第%d个...' % order)
    if mode == 'mul':
        print("该文件含有多张图:")
    if os.path.exists(_local_name(url)):
        print('已下载第%d个' % order)
        return
    # Probe the .jpg URL first; a non-200 answer means the original is a PNG.
    data = requests.get(url, headers=headers, timeout=60)
    ispng = data.status_code != 200
    if ispng:
        url = url.replace("jpg", "png")
    if mode == 'mul':
        if ispng:
            _download_multi(url, "png", 'png')
        else:
            _download_multi(url, "jpg", 'jpeg')
    else:
        if ispng:
            if os.path.exists(_local_name(url)):
                print('已下载第%d个' % order)
                return
            data = requests.get(url, headers=headers, timeout=60)
            if data.status_code == 200:
                _save(data, url, 'png')
                print('已下载第%d个' % order)
        else:
            # The probe request already fetched the JPEG body; reuse it.
            _save(data, url, 'jpeg')
            print('已下载第%d个' % order)


# ---------------------------------------------------------------------------
# Main script
# ---------------------------------------------------------------------------
number = sys.argv[1]
file_path = sys.argv[2] + '\\Picture\\'  # change this to alter the save path
url_save = "http://spotlight.pics/zh/a/%s" % number
wb = requests.get(url_save, headers=headers)
wb_data = BeautifulSoup(wb.text, 'lxml')
# Strip characters that are illegal in Windows directory names.
title = wb_data.h2.string.replace("\n", "").replace(":", "").replace("?", "").replace("\"", "").replace(" ", "")
title = title.replace("<", "").replace(">", "").replace("|", "").replace("*", "").replace("/", "").replace("\\", "")
file_path = (file_path + title) + "\\"
if not os.path.exists(file_path):
    introduce = str(wb_data.h2.next_sibling.next_sibling.next_element)
    # Fixed: os.mkdir() fails when the parent "Picture" directory does not
    # exist yet; makedirs creates the whole chain.
    os.makedirs(file_path)
    # Save the article introduction next to the images.
    with open(file_path + "介绍.txt", "w", encoding="utf-8") as f:
        f.write("特辑号:%s\n" % number + introduce)
divs = wb_data.body.select('div[class="illust-wrap"]')
number_of_file = len(divs)
headers['Accept'] = 'image/webp,image/*,*/*;q=0.8'
for div in divs:
    parent_class = str(div.a.parent['class'])
    if parent_class != '[\'ugoira-player\', \'ui-scroll-view\']':
        if parent_class == '[\'illust-multi-page-wrap\']':
            getpic(div.img['src'], div.a['href'], "mul")
        else:
            getpic(div.img['src'], div.a['href'])
    # TODO: decide how to handle animated works (ugoira) — currently skipped.
    order += 1