Scraping Emoji Packs from Chinaz (站长之家)

jp-mao 2017-04-24 20:50

from bs4 import BeautifulSoup
import os
import requests

# Fetch a page: returns decoded text by default, raw bytes for s='content'
def getHtmlText(url, s='text'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        if s == 'text':
            return r.text
        elif s == 'content':
            return r.content
        else:
            return ''
    except requests.RequestException:
        return ''
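A note on this helper: s='text' returns the page decoded with the encoding requests guesses from the body, while s='content' returns raw bytes, which is what the image download below needs. On any request error it swallows the exception and returns an empty string, so callers have to expect that. A quick check of the two modes, using the same listing URL as the script:

# Quick check of both modes
page = getHtmlText('http://sc.chinaz.com/biaoqing/index.html')            # decoded HTML (str)
raw = getHtmlText('http://sc.chinaz.com/biaoqing/index.html', 'content')  # raw bytes
print(len(page), len(raw))   # both lengths are 0 if the requests failed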
# Get each pack's name and the link to its detail page
def getEmotionInfo(html):
    soup = BeautifulSoup(html, 'html.parser')
    emo_divs = soup.find_all('div', attrs={'class': 'up'})
    for div in emo_divs:
        a = div.find('div', attrs={'class': 'num_1'}).find('a')
        title = a.attrs['title']
        href = a.attrs['href']
        getEmotionImgInfo(title, href)
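The two nested find calls encode an assumption about the listing markup: each pack sits in a div.up wrapper, and the link carrying the pack's title lives inside its div.num_1 child. The same lookup can also be written as one CSS selector; a sketch of that variant under the same assumption (the name list_packs is mine, not from the original):

from bs4 import BeautifulSoup

def list_packs(html):
    # CSS-selector version of the lookup above; returns (title, href) pairs
    soup = BeautifulSoup(html, 'html.parser')
    return [(a['title'], a['href']) for a in soup.select('div.up div.num_1 a')]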
# Get the URL of every image in a pack
def getEmotionImgInfo(title, href):
    html = getHtmlText(href)
    soup = BeautifulSoup(html, 'html.parser')
    # the gallery div comes right after div.img_text; two next_sibling hops
    # skip the whitespace text node sitting between the two tags
    img_div = soup.find('div', attrs={'class': 'img_text'}).next_sibling.next_sibling
    imgs = img_div.find_all('img')
    url_list = []
    for img in imgs:
        src = img.attrs['src']
        url_list.append(src)
    getImg(title, url_list)
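The double next_sibling hop is the least obvious line here: with html.parser, the newline between two tags is itself a node in the parse tree, so the first hop lands on that whitespace string and the second on the actual next tag. A tiny self-contained demonstration, plus the more robust find_next_sibling alternative:

from bs4 import BeautifulSoup

doc = '<div class="img_text">caption</div>\n<div class="pack"><img src="a.gif"/></div>'
soup = BeautifulSoup(doc, 'html.parser')
node = soup.find('div', attrs={'class': 'img_text'})
print(repr(node.next_sibling))                   # '\n' -- a whitespace text node
print(node.next_sibling.next_sibling['class'])   # ['pack']
print(node.find_next_sibling('div')['class'])    # ['pack'], skipping text nodes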
# Download the pack's images and save them locally
def getImg(title, url_list):
    root = 'D://pics//' + title
    if not os.path.exists(root):
        os.makedirs(root)          # makedirs also creates D://pics if it is missing
    count_small = 0
    for key in url_list:
        path = root + '//' + key.split('/')[-1]
        if not os.path.exists(path):
            img_content = getHtmlText(key, 'content')
            if not img_content:    # download failed; skip rather than write an empty file
                continue
            with open(path, 'wb') as f:
                f.write(img_content)
            count_small = count_small + 1
            print('\r{} progress: {:.2f}%'.format(title, count_small * 100 / len(url_list)), end='')
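getImg buffers each image fully in memory before writing, which is fine for small GIFs. For larger files, a streamed download writes chunks to disk as they arrive; a sketch of that alternative (download_streamed is my name, not part of the original script):

import requests

def download_streamed(url, path, chunk_size=8192):
    # Stream the response body to disk instead of buffering it all in memory
    with requests.get(url, timeout=30, stream=True) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)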
if __name__ == '__main__':
    first_url = 'http://sc.chinaz.com/biaoqing/index.html'
    root_url = 'http://sc.chinaz.com/biaoqing/index_'

    pages = 20
    for i in range(1, pages):  # walk the listing pages (1 through pages - 1)
        if i == 1:
            html = getHtmlText(first_url)
        else:
            url = root_url + str(i) + '.html'
            html = getHtmlText(url)
        getEmotionInfo(html)
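The loop special-cases i == 1 because of the site's pagination scheme: the first listing page is index.html, and page N (N >= 2) is index_N.html. That rule can be captured in one helper (listing_url is mine, shown only to make the pattern explicit):

def listing_url(i):
    # Page 1 has no numeric suffix; later pages are index_2.html, index_3.html, ...
    base = 'http://sc.chinaz.com/biaoqing/'
    return base + ('index.html' if i == 1 else 'index_{}.html'.format(i))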
