首页 > 技术文章 > python爬虫一些参考代码

gilgamesh-hjb 2017-08-07 16:43 原文

http://www.cnblogs.com/Dadio/p/5513594.html

这个是爬P站的代码,目前还没看,感觉很棒


from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
import os
import codecs
import sys
# Default HTTP headers sent with every request made by this script.
# 'Referer' is overwritten per image inside getpic(), and 'Accept' is switched
# to an image MIME list by the main script before the downloads start.
headers = dict([
    ('Accept', 'text/html'),
    ('Accept-Language', 'zh-CN,zh;q=0.8'),
    ('Referer', ""),
    # Placeholder value: put your own browser's User-Agent string here.
    ('User-Agent', "此处为浏览器的user-agent"),
])

# 1-based index of the illustration currently being processed; getpic() reads
# it for its progress messages and the main loop increments it.
order = 1
def _local_path(url):
    """Return the on-disk path for *url*: the global ``file_path`` directory
    plus the last 15 characters of the URL with every '/' removed."""
    return file_path + url.replace('/', "")[-15:]


def _download(url, image_format):
    """Fetch *url* and save it through PIL as *image_format*.

    Returns True when the image was saved and False when the server answered
    with anything other than HTTP 200 (nothing is written in that case).
    """
    response = requests.get(url, headers=headers, timeout=60)
    if response.status_code != 200:
        return False
    Image.open(BytesIO(response.content)).save(_local_path(url), image_format)
    return True


def getpic(src, href, mode=""):
    """Download the full-size image(s) that belong to one thumbnail.

    The thumbnail URL is turned into the original-size URL by dropping
    ``_master1200`` and replacing characters 20-40 with ``img-original``.
    The ``.jpg`` variant is tried first and ``.png`` second.  Files that are
    already on disk are never requested again.

    Reads the module globals ``number_of_file``, ``order``, ``file_path`` and
    ``headers`` (whose ``Referer`` entry is overwritten with *href*).

    Args:
        src: thumbnail image URL taken from the page.
        href: URL of the work's page, sent as the ``Referer`` header.
        mode: ``"mul"`` for a multi-page work (pages ``p0``, ``p1``, ... are
            fetched until the server stops answering 200, at most 151 pages);
            any other value downloads a single image.

    Returns:
        A fixed message string when *src* ends in ``gif`` (such sources are
        skipped), otherwise ``None``.

    Raises:
        Whatever ``requests.get`` or PIL raise on network / decoding errors.
    """
    os.system("cls")  # Windows console clear, kept from the original script
    print("共有%d个文件需要下载" % number_of_file)
    if src[-3:] == "gif":
        return '''使用gif来保存静态图片的都是邪教'''
    headers['Referer'] = href

    url = src.replace("_master1200", "")
    # NOTE(review): the fixed 20:40 slice presumably matches the size/"img-master"
    # part of the thumbnail URL layout -- confirm against real URLs.
    resize_segment = url[20:40]
    if resize_segment:  # replacing "" would scatter the text through the URL
        url = url.replace(resize_segment, "img-original")

    multi_page = mode == 'mul'
    print('正在下载第%d个...' % order)
    if multi_page:
        print("该文件含有多张图:")
        print("********正在下载第1张")

    # Settle the first image and, with it, the extension used by all pages.
    # Local files are checked for both extensions before touching the network
    # so an interrupted run can resume without re-probing the server.
    png_url = url.replace("jpg", "png")
    extension, image_format = "jpg", "jpeg"
    if os.path.exists(_local_path(url)):
        pass
    elif os.path.exists(_local_path(png_url)):
        url, extension, image_format = png_url, "png", "png"
    elif _download(url, "jpeg"):
        pass
    elif _download(png_url, "png"):
        url, extension, image_format = png_url, "png", "png"
    else:
        # Neither variant could be fetched: report it instead of claiming
        # success or feeding an error body to PIL.
        print('第%d个下载失败' % order)
        return

    if not multi_page:
        print('已下载第%d个' % order)
        return

    print("********已下载第1张")
    for page in range(1, 151):
        # Advance the page suffix in place: ..._p0.ext -> ..._p1.ext -> ...
        url = url.replace("p%d.%s" % (page - 1, extension),
                          "p%d.%s" % (page, extension))
        os.system("cls")
        print("********正在下载第%d张..." % (page + 1))
        if not os.path.exists(_local_path(url)):
            if not _download(url, image_format):
                break  # first non-200 answer marks the end of the work
        print("********已下载第%d张" % (page + 1))

# ---- Script body: download every illustration of one spotlight feature ----
# Command line: <script> <feature number> <base directory>
if len(sys.argv) < 3:
    sys.exit("用法: python %s <特辑号> <保存目录>" % sys.argv[0])
number = sys.argv[1]
file_path = sys.argv[2] + '\\Picture\\'  # edit here to change the download root
url_save = "http://spotlight.pics/zh/a/%s" % number
wb = requests.get(url_save, headers=headers, timeout=60)
wb.raise_for_status()  # fail clearly instead of parsing an error page
wb_data = BeautifulSoup(wb.text, 'lxml')

# Use the page's first <h2> as the folder name, minus whitespace and the
# characters Windows forbids in directory names.
title = wb_data.h2.string
for forbidden in '\n:?" <>|*/\\':
    title = title.replace(forbidden, "")
file_path = (file_path + title) + "\\"

# First run for this feature: create the folder and store its introduction.
if not os.path.exists(file_path):
    introduce = str(wb_data.h2.next_sibling.next_sibling.next_element)
    os.makedirs(file_path)  # also creates the "Picture" parent when missing
    with codecs.open(file_path + "介绍.txt", "w", "utf-8") as f:
        f.write("特辑号:%s\n" % number + introduce)

divs = wb_data.body.select('div[class="illust-wrap"]')
number_of_file = len(divs)  # read by getpic() for its progress line
headers['Accept'] = 'image/webp,image/*,*/*;q=0.8'
for div in divs:
    # BeautifulSoup exposes "class" as a list of names; test membership
    # rather than comparing the list's string representation.
    wrapper_classes = div.a.parent.get('class', [])
    if 'ugoira-player' not in wrapper_classes:
        if 'illust-multi-page-wrap' in wrapper_classes:
            getpic(div.img['src'], div.a['href'], "mul")
        else:
            getpic(div.img['src'], div.a['href'])
    # TODO: decide how animated (ugoira) works should be handled.
    order += 1

 

推荐阅读