DAY 89 爬虫03

1 requests高级使用
    -使用代理
    -上传文件
2 自动点赞
    -模拟的很像浏览器（浏览器带什么你就带什么）
3 爬取新闻
    -requests+bs4的简单使用
    
4 bs的详细使用
    -遍历文档树：
        -.
        -标签名字
        -标签属性
        -标签的文本
    -搜索文档树
        -find   name属性标签名，attrs属性，class_,name,id
        -find_all
        -5种过滤器：字符串，正则，列表，布尔，方法
        -limit，是否递归
    -css选择器
        -soup.select('css选择器')
        
5 selenium的基本使用
    -驱动浏览器，模拟人的行为
    -相应浏览器的驱动（谷歌---》谷歌浏览器版本对应）
    -驱动放在了项目根路径，实例化得到对象的时候，指定路径

1 代理池搭建

##### 1 下载
git clone git@github.com:jhao104/proxy_pool.git
    
##### 2 安装依赖
pip install -r requirements.txt

##### 3 修改配置文件 setting.py
DB_CONN = 'redis://127.0.0.1:8888/0'

##### 4 启动爬虫，启动服务
# 启动调度程序
python proxyPool.py schedule
# 启动webApi服务
python proxyPool.py server

##### 5 访问获取代理
http://127.0.0.1:5010/get

2 selenium的基本使用（模拟登陆百度）



from selenium import webdriver
import time

# Launch Chrome through the chromedriver binary at this relative path.
driver=webdriver.Chrome(executable_path='chromedriver.exe')

try:
    # Implicit wait: each element lookup keeps retrying for up to 10s when the
    # element is not there yet. Configured before the first navigation so it
    # is in force for every lookup below.
    driver.implicitly_wait(10)

    driver.get('https://www.baidu.com')

    # Locate the login link. Alternatives listed in the original notes:
    #   driver.find_element_by_partial_link_text('登录')        # partial match on the <a> text
    #   driver.find_element_by_link_text(...)                   # exact match on the <a> text
    #   driver.find_element_by_id('s-top-loginbtn')
    #   driver.find_element_by_css_selector('#s-top-loginbtn')
    a_logon=driver.find_element_by_xpath('//*[@id="s-top-loginbtn"]')
    a_logon.click()

    # Find the "log in with username/password" control by id and click it.
    username_login=driver.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
    username_login.click()

    # Username and password input boxes.
    username=driver.find_element_by_id('TANGRAM__PSP_11__userName')
    password=driver.find_element_by_id('TANGRAM__PSP_11__password')

    # Fill in the credentials.
    username.send_keys('33333@qq.com')
    password.send_keys('lqz12345')

    # Fixed pause kept from the original walkthrough (presumably so the
    # filled-in form can be observed before submitting).
    time.sleep(3)

    # Find the submit button and click it.
    button=driver.find_element_by_id('TANGRAM__PSP_11__submit')
    button.click()
    time.sleep(3)
finally:
    # Always end the WebDriver session, even when a lookup above raised.
    # quit() ends the whole session, whereas close() only closes the
    # current window.
    driver.quit()



# driver.find_element_by_class_name() # 通过类名找到一个
# driver.find_element_by_name()  # 通过name属性
# driver.find_element_by_tag_name() #通过标签名
#
# driver.find_elements_by_class_name() # 通过类名找到所有
# driver.find_elements_by_name()       # 通过属性name找到所有符合的标签


# driver.find_element_by_css_selector('')  # 通过css选择器找控件

# driver.find_element_by_xpath('//*[@id="s-top-loginbtn"]')

3 无界面浏览器

1 无头浏览器使用
2 获取标签的内容  标签.text



from selenium import webdriver

# 无界面浏览器的配置
from selenium.webdriver.chrome.options import Options
# Headless Chrome configuration.
chrome_options = Options()
# Optional switches left disabled in the original notes:
#   'window-size=1920x3000'              -> set the browser resolution
#   '--disable-gpu'                      -> noted as a bug workaround mentioned in Google's docs
#   '--hide-scrollbars'                  -> hide scrollbars, for some special pages
#   'blink-settings=imagesEnabled=false' -> do not load images, for speed
# No visible window. Per the original note, on Linux without display support
# the browser fails to start unless this flag is given.
chrome_options.add_argument('--headless')

# The browser runs in the background; its window is never opened.
driver = webdriver.Chrome(
    options=chrome_options,
    executable_path='chromedriver.exe',
)

driver.get('https://www.cnblogs.com/')

# Dump the complete HTML of the loaded page.
page_html = driver.page_source
print(page_html)

# Same lookup via CSS: driver.find_elements_by_css_selector('.post-item')
article_list = driver.find_elements_by_class_name('post-item')
for article in article_list:
    # The element's .text is the visible text of the title link.
    title_link = article.find_element_by_css_selector('a.post-item-title')
    title = title_link.text
    print(title)

driver.close()

4 selenium的其他用法

4.1 获取位置，属性，大小

   a=article.find_element_by_css_selector('a.post-item-title')
    title = a.text
    href=a.get_attribute('href')  # 标签对象.get_attribute('href')
    print(title)
    print(href)
    # 了解
    print(a.id)  # 这个id不是该标签的id号

    print(a.tag_name)  #标签名
    print(a.location)  # 位置  {'x': 221, 'y': 382}
    print(a.size)    #标签大小 {'height': 20, 'width': 303}

4.2 等待元素被加载

1 代码访问速度很快，页面中控件还没加载出来，如果取控件，会报错
2 两种等待方式
    -显式等待（当不知道控件何时加载出来时使用），需要给每个控件都加等待
wait=WebDriverWait(driver,10)
wait.until(EC.presence_of_element_located((By.ID,'content_left')))
    -隐式等待（以后全用隐式等待）
    # 隐式等待(查找任何控件时，如果控件没加载出来，最多等待10s)
driver.implicitly_wait(10)

4.3 元素操作

1 向input框输入值
    对象.send_keys('值')
2 点击控件
    对象.click()
3 清空input框中的值
    对象.clear()

4.4 执行js代码

# 1. Run JavaScript in the current page (the string below is a placeholder meaning "write your JS here")
driver.execute_script('写js即可')
# 2. Common use: scroll the page down by the height of the document body
driver.execute_script('window.scrollBy(0,document.body.scrollHeight)')

4.5 切换选项卡

import time
from selenium import webdriver

# Demo: open a second tab with JavaScript and move between the two tabs.
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# window.open() with no arguments creates a new, empty tab.
browser.execute_script('window.open()')

# window_handles lists every open tab.
print(browser.window_handles)

# Focus the newly opened tab (index 1) and load a page in it.
# Older spelling of the same call: browser.switch_to_window(handle)
new_tab = browser.window_handles[1]
browser.switch_to.window(new_tab)
browser.get('https://www.taobao.com')
time.sleep(1)

# Back to the first tab (index 0) and load another page there.
first_tab = browser.window_handles[0]
browser.switch_to.window(first_tab)
browser.get('https://www.sina.com.cn')

# close() closes only the current window; quit() shuts the whole browser down.
browser.close()
browser.quit()

4.6 浏览器前进后退

import time
from selenium import webdriver

# Demo: build up some browsing history, then step back and forward through it.
browser = webdriver.Chrome()

# Visit three pages in order so there is history to navigate.
for url in (
    'https://www.baidu.com',
    'https://www.taobao.com',
    'http://www.sina.com.cn/',
):
    browser.get(url)

# Go back one page, pause, then go forward again.
browser.back()
time.sleep(3)
browser.forward()

browser.close()

4.7 异常处理

from selenium import webdriver
# Demo of exception handling around WebDriver calls.
# `browser` is bound before the try block so the cleanup in `finally` never
# hits an unbound name when webdriver.Chrome() itself fails to start.
browser = None
try:
    browser=webdriver.Chrome()
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # NOTE(review): 'iframssseResult' looks like a deliberately wrong frame
    # name used to trigger the except branch - confirm before "fixing" it.
    browser.switch_to.frame('iframssseResult')

except Exception as e:
    # Report the failure instead of crashing the demo.
    print(e)
finally:
    # Only tear down a browser that was actually started; quit() ends the
    # whole WebDriver session rather than just the current window.
    if browser is not None:
        browser.quit()

5 selenium登陆cnblogs获取cookie

from selenium import webdriver

# driver = webdriver.Chrome(executable_path='chromedriver.exe')
#
# driver.implicitly_wait(10)
# driver.get('http://www.cnblogs.com')
#
# login = driver.find_element_by_css_selector('#navbar_login_status > a:nth-child(6)')
#
# login.click()
#
# username = driver.find_element_by_id('mat-input-0')
# username.send_keys('616564099@qq.com')
#
# input('等炸')
#
# cookie = driver.get_cookies()
# import json
#
# # 把cookie保存到本地
# with open('cookie.json', 'w', encoding='utf-8') as f:
#     json.dump(cookie, f)
#
#
# driver.close()



# Open a browser and write the cookies stored in cookie.json into it.
import json
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.implicitly_wait(10)
# NOTE(review): the site is opened before any cookie is added, presumably
# because cookies are bound to the current domain - confirm.
driver.get('http://www.cnblogs.com')

with open('cookie.json', 'r', encoding='utf-8') as f:
    cookie = json.loads(f.read())

# add_cookie takes a single cookie dict, while the JSON file holds a list of
# them, so they are added one by one.
for item in cookie:
    driver.add_cookie(item)

# Reload, pause, and reload again, exactly as in the original walkthrough.
driver.refresh()
time.sleep(3)
driver.refresh()

driver.close()

6 抽屉半自动点赞

from selenium import webdriver
import time
import json
# driver=webdriver.Chrome(executable_path='chromedriver.exe')
#
# driver.implicitly_wait(10)
# driver.get('https://dig.chouti.com/')
# # time.sleep(3)
# # # login=driver.find_element_by_link_text('登录')
# # login=driver.find_element_by_css_selector('body > div:nth-child(1) > div > header > div > a.btn.right.publish-btn')
# # print(login)
# # login.click()
# #
# # username=driver.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
# # password=driver.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div > input.input.pwd-input.pwd-input-active.pwd-password-input')
# # username.send_keys('18953675221')
# # password.send_keys('lqz123')
# # time.sleep(2)
# #
# # login_btn=driver.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#
# # 有可能有验证码
#
# input('等会')
#
# cookie=driver.get_cookies()
#
# with open('chouti.json','w',encoding='utf-8') as f:
#     json.dump(cookie,f)
#
#
#
# time.sleep(2)
# driver.close()

# 交给requests模块了

import requests
from requests.cookies import RequestsCookieJar
# Headers sent with every request below.
header = {
    'Referer': 'https://dig.chouti.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}

# requests has no default timeout: without one, a stalled connection blocks
# the script forever.
REQUEST_TIMEOUT = 10  # seconds

# Fetch the 24-hour list. Fail on an HTTP error status rather than trying to
# decode an error page as JSON.
hot_list = requests.get(
    'https://dig.chouti.com/top/24hr?_=1621483658031',
    headers=header,
    timeout=REQUEST_TIMEOUT,
)
hot_list.raise_for_status()
res = hot_list.json()

# Build a cookie jar from the cookies stored locally in chouti.json.
jar=RequestsCookieJar()
with open('chouti.json','r',encoding='utf-8') as f:
    cookie=json.load(f)

# Only the name and value of each stored cookie are copied into the jar.
for item in cookie:
    jar.set(item['name'],item['value'])

# Post one vote per entry of the list, sending the stored cookies along.
url = 'https://dig.chouti.com/link/vote'
for item in res['data']:
    print(item['id'])

    data = {
        'linkId': item['id']
    }
    res_vode = requests.post(
        url,
        headers=header,
        data=data,
        cookies=jar,
        timeout=REQUEST_TIMEOUT,
    )
    print(res_vode.text)

7 爬取京东商品信息

from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
import requests


def get_goods(driver):
    """Print and download every product on the current results page, then page on.

    For each ``gl-item`` element the image URL, price, comment count, link and
    name are read, printed, and the image is saved under ``img/``. A failure
    on one product is printed and that product is skipped. After a page is
    processed the '下一页' (next page) link is clicked, the page is scrolled
    down, and the following page is handled the same way.

    Pages are walked in a loop rather than by recursion, so the crawl depth is
    not limited by the interpreter's recursion limit.

    Args:
        driver: WebDriver-like object already showing a results page.

    Raises:
        Whatever ``driver`` raises when the next-page link cannot be found or
        clicked. That exception is the only way this function ends, so
        callers are expected to catch it.
    """
    import os

    # Without this directory every open('img/...') below would fail and be
    # swallowed by the per-item handler, silently saving nothing.
    os.makedirs('img', exist_ok=True)

    while True:
        li_list = driver.find_elements_by_class_name('gl-item')

        for li in li_list:
            try:
                img_tag = li.find_element_by_css_selector('.p-img img')
                img = img_tag.get_attribute('src')
                if not img:
                    # Fall back to the data-lazy-img attribute, prefixed with
                    # the scheme as in the original code.
                    img = 'https:' + img_tag.get_attribute('data-lazy-img')

                price = li.find_element_by_css_selector('.p-price i').text
                commit = li.find_element_by_css_selector('.p-commit a').text
                href = li.find_element_by_css_selector('.p-img a').get_attribute('href')
                name = li.find_element_by_css_selector('.p-name em').text
                print('''
            商品名称：%s
            商品连接：%s
            商品图片：%s
            商品价格：%s
            商品评论数：%s
            ''' % (name, href, img, price, commit))

                # Save the image locally. The timeout keeps one stalled
                # download from hanging the crawl; an HTTP error is reported
                # instead of being written to disk as if it were an image.
                res = requests.get(img, timeout=10)
                res.raise_for_status()
                img_name = img.rsplit('/', 1)[-1]
                with open('img/%s' % img_name, 'wb') as f:
                    for chunk in res.iter_content(1024):
                        f.write(chunk)
            except Exception as e:
                # Best effort per product: report the problem and move on.
                print(e)

        # Move to the next page; a failed lookup propagates to the caller.
        next_link = driver.find_element_by_partial_link_text('下一页')
        next_link.click()
        time.sleep(1)
        driver.execute_script('window.scrollBy(0,document.body.scrollHeight)')


# Entry script: open JD, search for a keyword, then hand the results page to
# get_goods.
JD_HOME = 'https://www.jd.com/'
SEARCH_KEYWORD = '精品内衣'

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.implicitly_wait(10)

try:
    driver.get(JD_HOME)

    input_search = driver.find_element_by_id('key')
    # Type the keyword and press Enter in a single call. The original notes
    # show a separate input_search.send_keys(Keys.ENTER) as the alternative
    # way to simulate the Enter key.
    input_search.send_keys(SEARCH_KEYWORD, Keys.ENTER)

    get_goods(driver)
except Exception as e:
    # Any failure above, including one raised inside get_goods, is printed.
    print(e)
finally:
    driver.close()

8 几个爬虫案例

# 2 爬红楼梦小说

#http://www.shicimingju.com/book/hongloumeng.html

# import requests
#
# from bs4 import BeautifulSoup
# ret=requests.get('https://www.shicimingju.com/book/hongloumeng.html')
# ret.encoding='utf-8'
#
# soup=BeautifulSoup(ret.text,'lxml')
# li_list=soup.find(class_='book-mulu').find('ul').find_all('li')
# with open('hlm.txt','w',encoding='utf-8') as f:
#     for li in li_list:
#         content=li.find('a').text
#         url='https://www.shicimingju.com'+li.find('a').get('href')
#         f.write(content)
#         f.write('\n')
#         res_content=requests.get(url)
#         res_content.encoding = 'utf-8'
#         res_content.encoding=res_content.apparent_encoding
#         soup2=BeautifulSoup(res_content.text,'lxml')
#         content_detail=soup2.find(class_='chapter_content').text
#         f.write(content_detail)
#         f.write('\n')
#         print(content,'写入了')


# 3 爬肯德基门店

# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
import requests

# Query the KFC store-list endpoint with a form POST and print the JSON reply.
store_list_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

# Form fields of the search request.
# NOTE(review): pageIndex/pageSize presumably control paging - confirm
# against the endpoint.
data = dict(
    cname='',
    pid=20,
    keyword='浦东',
    pageIndex=1,
    pageSize=10,
)

ret = requests.post(store_list_url, headers=header, data=data)
stores = ret.json()
print(stores)


# 爬取糗事百科
# 4 爬糗事百科段子

#https://www.qiushibaike.com/text/page/2/
import requests
from bs4 import BeautifulSoup
# Download one listing page and print the text of every element with class
# 'content' found inside an element with class 'article'.
page_url = 'https://www.qiushibaike.com/text/page/2/'
ret = requests.get(page_url)

soup = BeautifulSoup(ret.text, 'html.parser')

article_list = soup.find_all(class_='article')
for article in article_list:
    content_tag = article.find(class_='content')
    content = content_tag.text
    # The text and a separator line, each on its own line.
    print(content, '-------', sep='\n')

1 动作链（12306），切换frame（很少了）

DAY 89 爬虫03

1 代理池搭建

2 selenium的基本使用（模拟登陆百度）

3 无界面浏览器

4 selenium的其他用法

4.1 获取位置，属性，大小

4.2 等待元素被加载

4.3 元素操作

4.4 执行js代码

4.5 切换选项卡

4.6 浏览器前进后退

4.7 异常处理

5 selenium登陆cnblogs获取cookie

6 抽屉半自动点赞

7 爬取京东商品信息

8 几个爬虫案例

推荐阅读