Web Scraping with Selenium

zhangqing979797 2019-02-28 21:51


Original article: https://www.cnblogs.com/bobo-zhang/p/9685362.html

 

Installing Selenium

Environment setup: pip install selenium
Coding workflow:
1. Import the package: from selenium import webdriver
2. Instantiate a browser object
3. Write the automation code (the Baidu example below walks through these steps)

 

When the script below runs, a Chrome window opens automatically and the scripted actions are performed in it:

from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\chromedriver.exe')
bro.get(url='https://www.baidu.com/')
sleep(2)
text_input = bro.find_element_by_id('kw')   # locate the Baidu search box
text_input.send_keys('人民币')
sleep(2)
bro.find_element_by_id('su').click()        # click the search button
sleep(3)
# grab the current page's source
print(bro.page_source)
bro.quit()
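
A note on the fixed sleep() calls above: they are fragile, since page load time varies. Selenium also ships explicit waits, which block only until a condition is met. A minimal sketch of the same Baidu example using WebDriverWait (assuming chromedriver is on PATH, which is not part of the original post):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

bro = webdriver.Chrome()  # assumes chromedriver is on PATH
bro.get('https://www.baidu.com/')
# wait up to 10 seconds for the search box instead of sleeping blindly
text_input = WebDriverWait(bro, 10).until(
    EC.presence_of_element_located((By.ID, 'kw')))
text_input.send_keys('人民币')
bro.find_element_by_id('su').click()
bro.quit()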


Fetching data with Selenium

# Fetch more movie detail data from Douban Movies
from selenium import webdriver
from time import sleep

url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\chromedriver.exe')
bro.get(url)
sleep(3)
# scroll to the bottom three times so the lazy-loaded entries get rendered
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(3)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(3)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
page_text = bro.page_source

with open('./douban.html','w',encoding='utf-8') as fp:
    fp.write(page_text)

sleep(1)
bro.quit()
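
With the fully rendered page saved, the movie entries can be parsed out offline. A minimal sketch with lxml; the class name movie-name-text is an assumption about Douban's markup and should be confirmed against the live page:

from lxml import etree

with open('./douban.html', 'r', encoding='utf-8') as fp:
    tree = etree.HTML(fp.read())
# 'movie-name-text' is an assumed class name; inspect the page to verify
names = tree.xpath('//span[@class="movie-name-text"]/a/text()')
print(names)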

 

About headless Chrome

What is a headless browser?
A browser that runs without popping up a window, i.e. there is no visible page.

# Headless Chrome
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep

chrome_options = Options()
chrome_options.add_argument('--headless')     # run without a visible window
chrome_options.add_argument('--disable-gpu')  # historically needed for headless mode on Windows

# Fetch more movie detail data from Douban Movies
url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\chromedriver.exe',chrome_options=chrome_options)
bro.get(url)
sleep(3)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(3)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(3)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
page_text = bro.page_source

with open('./douban.html','w',encoding='utf-8') as fp:
    fp.write(page_text)
print(page_text)
sleep(1)
bro.quit()
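
One caveat: the executable_path and chrome_options keyword arguments match the Selenium 3 API this post was written against. Selenium 4 replaced them with a Service object and an options= parameter. A sketch of the newer style (the driver path is a placeholder):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
service = Service(r'C:\path\to\chromedriver.exe')  # placeholder path
bro = webdriver.Chrome(service=service, options=chrome_options)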


Logging in to QQ Zone automatically with Selenium

# QQ Zone: the login form lives inside an iframe
from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\chromedriver.exe')
url = 'https://qzone.qq.com/'
bro.get(url=url)
sleep(2)
# switch into the specific iframe that hosts the login form
bro.switch_to.frame('login_frame')
bro.find_element_by_id('switcher_plogin').click()  # switch to account/password login
sleep(2)

bro.find_element_by_id('u').send_keys('460086804')   # account input
bro.find_element_by_id('p').send_keys('shuo0127')    # password input

bro.find_element_by_id('login_button').click()       # submit the login form

sleep(5)

page_text = bro.page_source
with open('qq.html','w',encoding='utf-8') as fp:
    fp.write(page_text)
bro.quit()
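
The key detail above is the iframe switch: locators cannot see elements inside an iframe until you switch into it, and you must switch back out before touching anything outside it. A minimal sketch of the pattern (assumes chromedriver is on PATH):

from selenium import webdriver

bro = webdriver.Chrome()                 # assumes chromedriver is on PATH
bro.get('https://qzone.qq.com/')
bro.switch_to.frame('login_frame')       # enter the iframe by its id/name
# ... locate and operate on elements inside the iframe here ...
bro.switch_to.default_content()          # return to the top-level document
bro.quit()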


Thread pools

# Scrape video data from pearvideo.com
import requests
import re
from lxml import etree
from multiprocessing.dummy import Pool   # thread-based Pool with the multiprocessing API
import random

# instantiate a thread pool with 5 worker threads
pool = Pool(5)
url = 'https://www.pearvideo.com/category_1'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
page_text = requests.get(url=url,headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')

video_url_list = []
for li in li_list:
    detail_url = 'https://www.pearvideo.com/'+li.xpath('./div/a/@href')[0]
    detail_page = requests.get(url=detail_url,headers=headers).text
    # the real .mp4 address is embedded in a JS variable on the detail page
    video_url = re.findall('srcUrl="(.*?)",vdoUrl',detail_page,re.S)[0]
    video_url_list.append(video_url)
    
def getVideoData(url):
    # download the raw video bytes
    return requests.get(url=url,headers=headers).content

def saveVideo(data):
    # write the bytes to a randomly named .mp4 file
    fileName = str(random.randint(0,5000))+'.mp4'
    with open(fileName,'wb') as fp:
        fp.write(data)

# the worker functions must be defined before they are handed to the pool
video_data_list = pool.map(getVideoData,video_url_list)
pool.map(saveVideo,video_data_list)
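
The same fan-out can be written with the standard-library concurrent.futures module, which drops the multiprocessing.dummy dependency. A sketch reusing getVideoData and saveVideo from above:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as executor:
    video_data_list = list(executor.map(getVideoData, video_url_list))
    list(executor.map(saveVideo, video_data_list))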
