selenium的使用

1、概念

- 概念：基于浏览器自动化的一个模块。
- 环境的安装：
    - pip install selenium
- selenium和爬虫之间的关联：
    - 模拟登录
    - 便捷的捕获到动态加载的数据（重点）
        - 特点：可见即可得
        - 缺点：效率低

- selenium的具体使用
    - 准备浏览器的驱动程序：http://chromedriver.storage.googleapis.com/index.html


- 动作链：ActionChains，一系列的行为动作
    - 使用流程：
        - 实例化一个动作链对象，需要将指定的浏览器和动作链对象进行绑定
        - 执行相关的连续的动作
        - perform()立即执行动作链制定好的动作


- 12306模拟登录分析：
    - 验证码的处理：


- selenium规避风险
    - 正经打开一个网站进行window.navigator.webdriver的js注入，返回值为undefined
    - 使用selenium打开的页面，进行上述js注入返回的是true
- 无头浏览器
    - phantomJs
    - 谷歌无头

2、selenium的演示程序

from selenium import webdriver
from time import sleep

# Path to the browser driver. The r'' raw-string prefix keeps backslashes in
# a path from being interpreted as escape sequences.
driver = webdriver.Chrome(r'chromedriver')
# Open the Baidu home page.
driver.get("http://www.baidu.com")
# Find the "设置" (Settings) link and click it.
driver.find_elements_by_link_text('设置')[0].click()
sleep(2)
# In the opened menu, click "搜索设置" (Search settings).
driver.find_elements_by_link_text('搜索设置')[0].click()
sleep(2)

# Pick the third <option> of the results-per-page <select id="nr">
# (the original tutorial describes this as "50 results per page").
# The XPath is relative ('.//') so the lookup is scoped to the <select>;
# the option is clicked once, a second identical click was redundant.
m = driver.find_element_by_id('nr')
sleep(2)
m.find_element_by_xpath('.//option[3]').click()
sleep(2)

# Click the "save settings" button.
driver.find_elements_by_class_name("prefpanelgo")[0].click()
sleep(2)

# Handle the alert that pops up: accept() confirms, dismiss() cancels.
# switch_to.alert is used instead of the deprecated switch_to_alert() method.
driver.switch_to.alert.accept()
sleep(2)
# Find the Baidu search box and type the query.
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)
# Click the search button.
driver.find_element_by_id('su').click()
sleep(2)
# On the results page, find the "美女_百度图片" link and open it.
driver.find_elements_by_link_text('美女_百度图片')[0].click()
sleep(3)

# Close the browser.
driver.quit()

3、selenium的基本使用

from selenium import webdriver
from time import sleep
# Create a Chrome browser object backed by the local chromedriver binary.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')

# Send the request: load the JD home page.
url = 'https://www.jd.com/'
bro.get(url)
sleep(1)
# Locate the search input.
# Equivalent XPath lookup: bro.find_element_by_xpath('//input[@id="key"]')
search = bro.find_element_by_id('key')
search.send_keys('mac pro')  # type the query text into the located element
sleep(2)
# Locate the search button and click it to submit the query.
btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
btn.click()
sleep(2)
# Inject JavaScript to scroll the window to the bottom of the page.
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')

# Capture the HTML of the page as currently rendered and print it.
page_text = bro.page_source
print(page_text)
sleep(3)
# Close the browser.
bro.quit()

4、动态加载数据的捕获

#http://125.35.6.84:81/xk/,将药监总局前3页的企业名称进行爬取
from selenium import webdriver
from lxml import etree
from time import sleep
# Create a Chrome browser object backed by the local chromedriver binary.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
url = 'http://125.35.6.84:81/xk/'
bro.get(url)

# Collect the rendered HTML of the first page, then of each page reached by
# clicking the "next page" control twice (three pages in total).
all_page_text = [bro.page_source]
for _ in range(2):
    bro.find_element_by_xpath('//*[@id="pageIto_next"]').click()
    # Presumably gives the newly requested list time to render before the
    # page source is captured.
    sleep(1)
    all_page_text.append(bro.page_source)

# Parse every captured page and print the title attribute of the <dl>
# inside each list item of the element with id "gzlist".
for html in all_page_text:
    for item in etree.HTML(html).xpath('//*[@id="gzlist"]/li'):
        print(item.xpath('./dl/@title')[0])


sleep(2)
# Close the browser.
bro.quit()

5、动作链

from selenium import webdriver
from selenium.webdriver import ActionChains#动作连
from time import sleep
bro = webdriver.Chrome(executable_path='./chromedriver.exe')

url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'

bro.get(url)
# The draggable element lives inside an iframe; looking it up from the
# top-level document raises NoSuchElementException. Switch into the frame
# first with switch_to.frame.
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_xpath('//*[@id="draggable"]')

# Create an action chain bound to this browser.
action = ActionChains(bro)
action.click_and_hold(div_tag)  # press and hold on the element (queued)

# perform() makes the action chain execute its queued actions immediately.
for i in range(5):
    action.move_by_offset(xoffset=15,yoffset=15).perform()
    sleep(2)
# release() only queues the mouse-up; it must be performed as well,
# otherwise the button is never released and the drag never completes.
action.release().perform()
sleep(5)
bro.quit()

6、12306模拟登陆

from selenium import webdriver
from selenium.webdriver import ActionChains
from time import sleep
from PIL import Image #安装PIL或者是Pillow
from CJY import Chaojiying_Client

#封装一个识别验证码的函数
def transformCode(imgPath,imgType):
    """Upload a captcha image to Chaojiying and return the recognised result.

    imgPath: path of the image file to upload.
    imgType: captcha type code, passed through to ``PostPic``.
    Returns the ``'pic_str'`` entry of the response returned by ``PostPic``.
    """
    # NOTE(review): the account credentials are hard-coded here; consider
    # loading them from configuration instead.
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    # Read the image inside a context manager so the file handle is closed
    # even if reading fails.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']


bro = webdriver.Chrome(executable_path='./chromedriver.exe')

bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(2)
# Save a screenshot of the current browser page.
bro.save_screenshot('./main.png')
# The captcha region is cropped out of that screenshot, so first capture
# where the captcha <img> sits on the page.
img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = img_tag.location  # x/y of the element's top-left corner
size = img_tag.size  # width/height of the element
# Crop rectangle as (left, upper, right, lower).
rangle = (int(location['x']),int(location['y']),int(location['x']+size['width']),int(location['y']+size['height']))
# Crop and save inside a context manager so the screenshot file is closed.
with Image.open('./main.png') as screenshot:
    frame = screenshot.crop(rangle)  # cut out the given rectangle
    frame.save('code.png')

# Ask the captcha-recognition service for the points to click.
result = transformCode('./code.png',9004)
print(result) #x1,y1|x2,y2|x3,y3

# "x1,y1|x2,y2|x3,y3" ==> [[x1,y1],[x2,y2],[x3,y3]]
# split('|') yields a one-item list when there is no '|', so a single point
# needs no separate branch.
all_list = []
for pair in result.split('|'):
    coords = pair.split(',')
    all_list.append([int(coords[0]), int(coords[1])])


# Click each returned point, using offsets relative to the captcha image.
for x, y in all_list:
    ActionChains(bro).move_to_element_with_offset(img_tag,x,y).click().perform()
    sleep(1)


# Fill in the account name and password (placeholders), then submit.
bro.find_element_by_id('username').send_keys('xxxxxx')
sleep(1)
bro.find_element_by_id('password').send_keys('xxxx')
sleep(1)

bro.find_element_by_id('loginSub').click()

sleep(10)
print(bro.page_source)
bro.quit()

上边的代码中用到了一个模块，这个模块是超级鹰的一个验证码识别的模块；需要在超级鹰网站进行购买获取；代码如下：

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Store the account details and build the shared request data.

        username: account name, sent as the ``user`` form field.
        password: plain password; only the hex MD5 digest of its UTF-8
            bytes is stored and sent (as the ``pass2`` form field).
        soft_id: software id, sent as the ``softid`` form field.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Form fields included in every request.
        self.base_params = dict(
            user=self.username,
            pass2=self.password,
            softid=self.soft_id,
        )
        # HTTP headers sent with every request.
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload an image for recognition and return the decoded JSON reply.

        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        # Base params are applied last, exactly like update() on a fresh dict.
        form = dict({'codetype': codetype}, **self.base_params)
        upload = {'userfile': ('ccc.jpg', im)}
        response = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=form,
            files=upload,
            headers=self.headers,
        )
        return response.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON reply.

        im_id: id of the image whose result is being reported as wrong
        """
        form = dict({'id': im_id}, **self.base_params)
        response = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=form,
            headers=self.headers,
        )
        return response.json()

下边的代码是将这个类实例化封装到一个函数中：

#封装一个识别验证码的函数
def transformCode(imgPath,imgType):
    """Upload a captcha image to Chaojiying and return the recognised result.

    imgPath: path of the image file to upload.
    imgType: captcha type code, passed through to ``PostPic``.
    Returns the ``'pic_str'`` entry of the response returned by ``PostPic``.
    """
    # NOTE(review): the account credentials are hard-coded here; consider
    # loading them from configuration instead.
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    # Read the image inside a context manager so the file handle is closed
    # even if reading fails.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']

7、selenium规避风险

#正经打开一个网站进行window.navigator.webdriver的js注入，返回值为undefined
#使用selenium打开的页面，进行上述js注入返回的是true
#代码实现：
# 规避检测
from selenium import webdriver
from selenium.webdriver import ChromeOptions
# Build Chrome options that exclude the 'enable-automation' switch.
# NOTE(review): per this tutorial the goal is to keep sites from detecting
# Selenium via window.navigator.webdriver; whether this alone is enough
# depends on the Chrome/chromedriver version — confirm before relying on it.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# Start Chrome with those options applied.
bro = webdriver.Chrome(executable_path='./chromedriver.exe',options=option)

url = 'https://www.taobao.com/'

# Load the page; the browser is left open afterwards (no quit() call).
bro.get(url)

8、无头浏览器

#phantomJs

#谷歌无头

#代码实现：
#无头浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
# Configure Chrome to run headless (without a visible window).
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Pass the options through ``options=``; the ``chrome_options=`` keyword is
# deprecated, and the other Chrome-options snippet in this file already
# uses ``options=``.
bro = webdriver.Chrome(executable_path='./chromedriver.exe',options=chrome_options)
url = 'https://www.taobao.com/'
bro.get(url)
sleep(2)
# Save a screenshot of the rendered page.
bro.save_screenshot('123.png')

print(bro.page_source)
# A headless browser has no window to close by hand, so quit explicitly to
# avoid leaving the browser and driver processes running.
bro.quit()

selenium的使用

1、概念

2、selenium的演示程序

3、selenium的基本使用

4、动态加载数据的捕获

5、动作链

6、12306模拟登陆

7、selenium规避风险

8、无头浏览器

推荐阅读