首页 > 技术文章 > 用requests库和BeautifulSoup库爬取微博用户的粉丝数、关注数以及微博内容

huyer 2019-12-16 19:03 原文

import requests
import re
from bs4 import BeautifulSoup
import time
from pyquery import PyQuery as pq
from urllib.parse import urlencode
import pymongo
# Default HTTP request headers shared at module level: they present a desktop
# Firefox 70 User-Agent string to the server.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) '
        'Gecko/20100101 Firefox/70.0'
    ),
}
class Weibo(object):
    """Look up a Weibo account through the s.weibo.com user-search page.

    The methods are meant to be chained: build the search URL, download the
    page, parse it, extract the UID from the first ``<a target="_blank">``
    link, then read that user's fans / follow / profile link texts out of the
    same parsed page.
    """

    # Compiled once; the raw string avoids the invalid "\d" escape sequence.
    _DIGITS = re.compile(r'\d+')

    def __init__(self):
        self.headers = headers
        self.list = []

    def get_url(self, name):
        """Return the user-search URL for ``name``.

        The name is percent-encoded so that spaces, ``&``, ``#`` and
        non-ASCII characters cannot corrupt the query string.
        """
        query = urlencode({'q': name, 'Refer': 'weibo_user'})
        return 'https://s.weibo.com/user?' + query

    def get(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        ``timeout`` (seconds) is forwarded to ``requests.get`` so a stalled
        connection raises instead of blocking forever.
        """
        return requests.get(url, headers=self.headers, timeout=timeout).text

    def make_soup(self, html):
        """Parse ``html`` with BeautifulSoup using the ``lxml`` parser."""
        return BeautifulSoup(html, 'lxml')

    def get_UID(self, soup):
        """Return every run of digits found in the href of the first
        ``<a target="_blank">`` element of ``soup``.

        The other methods of this class use element 0 of the result as the UID.

        Raises:
            ValueError: if ``soup`` contains no such link.
        """
        link = soup.find(name='a', attrs={"target": "_blank"})
        if link is None:
            raise ValueError('no <a target="_blank"> link found in the page')
        return self._DIGITS.findall(link.attrs['href'])

    def _link_text(self, uid_list, soup, section):
        """Return the text of the ``//weibo.com/<uid>/<section>`` link."""
        if not uid_list:
            raise ValueError('no UID available to look up ' + section)
        href = '//weibo.com/' + uid_list[0] + '/' + section
        link = soup.find(name='a', attrs={"href": href})
        if link is None:
            raise ValueError('no link to ' + href + ' found in the page')
        return link.text

    # NOTE: the parameter name ``list`` shadows the builtin; it is kept so
    # existing keyword callers keep working.
    def get_fans(self, list, soup):
        """Return the text of the user's ``/fans`` link."""
        return self._link_text(list, soup, 'fans')

    def get_follow(self, list, soup):
        """Return the text of the user's ``/follow`` link."""
        return self._link_text(list, soup, 'follow')

    def get_profile(self, list, soup):
        """Return the text of the user's ``/profile`` link."""
        return self._link_text(list, soup, 'profile')

    def chucun(self, list, name, fans, follow, profile):
        """Bundle the scraped values into a single summary dict."""
        return {
            '姓名': name,
            'uid': list[0],
            '粉丝': fans,
            '关注': follow,
            '微博': profile,
        }




class Text(object):
    """Fetch one page of a user's posts from the m.weibo.cn JSON endpoints.

    ``uid`` must be a string (it is concatenated into the Referer header);
    ``page`` is the page number sent to the container API.
    """

    # Seconds to wait for a single HTTP response before giving up.
    timeout = 10

    _INDEX_URL = 'https://m.weibo.cn/api/container/getIndex?'

    def __init__(self, uid, page):
        self.wbjg = {}
        self.uid = uid
        self.list_bid = []
        self.list_html = []
        self.dict = {}
        self.page = page
        # Filled by get_news(); created here so the attributes always exist.
        self.list_text = []
        self.list_zf = []
        self.list_pl = []
        self.list_dz = []
        self.headerss = {
            'Host': 'm.weibo.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
            'Referer': 'https://m.weibo.cn/u/' + self.uid,
            'X-Requested-With': 'XMLHttpRequest',
        }

    def _get_json(self, base_url, params):
        """GET ``base_url`` + url-encoded ``params`` and decode the JSON body."""
        url = base_url + urlencode(params)
        response = requests.get(url, headers=self.headerss, timeout=self.timeout)
        return response.json()

    def get_containerid(self):
        """Return the ``containerid`` of the second tab of the user's index.

        NOTE(review): index 1 is presumably the posts tab; confirm against
        the API response.
        """
        payload = self._get_json(self._INDEX_URL, {
            'type': 'uid',
            'value': self.uid,
        })
        tabs = payload.get('data').get('tabsInfo').get('tabs')
        return tabs[1].get('containerid')

    def get_page(self, containerids):
        """Return the decoded JSON for page ``self.page`` of ``containerids``."""
        return self._get_json(self._INDEX_URL, {
            'type': 'uid',
            'value': self.uid,
            'containerid': containerids,
            'page': self.page,
        })

    def get_url(self, html):
        """Return the ``bid`` of every card in ``html`` that carries a post.

        Cards without an ``mblog`` entry are skipped, and a response without
        ``data``/``cards`` yields an empty list instead of raising.
        """
        cards = (html.get('data') or {}).get('cards') or []
        return [
            card['mblog'].get('bid')
            for card in cards
            if card.get('mblog')
        ]

    def get_text(self, list2):
        """Download the status JSON for every id in ``list2``.

        Results are appended to ``self.list_html`` (which is also returned),
        with a 0.5 s pause after each request.
        """
        for bid in list2:
            status = self._get_json('https://m.weibo.cn/statuses/show?', {'id': bid})
            self.list_html.append(status)
            time.sleep(0.5)
        return self.list_html

    def get_news(self):
        """Rebuild the text / repost / comment / like lists from
        ``self.list_html``."""
        self.list_text = []
        self.list_zf = []
        self.list_pl = []
        self.list_dz = []
        for status in self.list_html:
            data = status.get('data')
            # The text field is passed through PyQuery and reduced to its
            # text content.
            self.list_text.append(pq(data.get('text')).text())
            self.list_zf.append(data.get('reposts_count'))
            self.list_pl.append(data.get('comments_count'))
            self.list_dz.append(data.get('attitudes_count'))

    def print(self, i):
        """Return a summary dict for the ``i``-th downloaded post.

        A fresh dict is built on every call (and also stored in
        ``self.wbjg``), so results from earlier calls are never mutated.
        """
        self.wbjg = {
            '文章': self.list_text[i],
            '点赞': self.list_dz[i],
            '转发': self.list_zf[i],
            '评论': self.list_pl[i],
        }
        time.sleep(0.5)
        return self.wbjg
_MAX_PAGES = 8        # pages 1..8 are fetched
_POSTS_PER_PAGE = 9   # at most this many posts are stored per page


def _lookup_user(name):
    """Search for ``name``, print the account summary and return its UID."""
    weibo = Weibo()
    soup = weibo.make_soup(weibo.get(weibo.get_url(name)))
    uid_list = weibo.get_UID(soup)
    fans = weibo.get_fans(uid_list, soup)
    follow = weibo.get_follow(uid_list, soup)
    profile = weibo.get_profile(uid_list, soup)
    print(weibo.chucun(uid_list, name, fans, follow, profile))
    return uid_list[0]


def _save_posts(uid, collection):
    """Fetch the first pages of ``uid``'s posts, print each and store them.

    Every page is downloaded exactly once; the stored posts of one page are
    written to ``collection`` in a single ``insert_many`` call.
    """
    # The container id depends only on the uid, so fetch it once.
    containerids = Text(uid, 1).get_containerid()
    for page in range(1, _MAX_PAGES + 1):
        print(page)
        fetcher = Text(uid, page)
        bids = fetcher.get_url(fetcher.get_page(containerids))
        statuses = fetcher.get_text(bids)
        fetcher.get_news()
        records = []
        # A page may hold fewer posts than the cap; never index past the end.
        for idx in range(min(_POSTS_PER_PAGE, len(statuses))):
            # Copy the result so every record is an independent dict even if
            # print() hands back the same object on each call.
            record = dict(fetcher.print(idx))
            print(record)
            records.append(record)
        if records:  # insert_many rejects an empty list
            collection.insert_many(records)


def main():
    """Interactive loop: 1 = look up a user, 2 = store posts, 3 = quit."""
    jixu = 1
    list1 = []
    while True:
        try:
            choice = int(jixu)
        except ValueError:
            # Non-numeric input is handled by the final "invalid" branch
            # instead of crashing the program.
            choice = None
        if choice == 1:
            name = input('你想看谁的呀:')
            uid = _lookup_user(name)
            jixu = input('你想要继续吗,还是去看微博:继续请按1,看微博请按2,退出请按3')
            list1.append(uid)
        elif choice == 2:
            client = pymongo.MongoClient(host='localhost', port=27017)
            try:
                db = client.text
                # NOTE(review): this uses the FIRST user looked up in this
                # session, not the most recent one; confirm that is intended.
                _save_posts(list1[0], db['wbjg'])
            finally:
                client.close()
            print('完毕,退出')
            break
        elif choice == 3:
            print('退出成功')
            break
        else:
            print('您在开玩笑呢')
            break


if __name__ == '__main__':
    main()
——2019.12.10
——By Huyer
 

推荐阅读