Retrieving proxies with Selenium, validating them, and saving locally


  Retrieving proxies with Selenium:

import selenium
import selenium.webdriver

url = "http://www.kuaidaili.com/free/inha/4/"
driver = selenium.webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(10)  # wait up to 10 seconds for each element to appear
elems = driver.find_elements_by_xpath("//tbody/tr")  # one table row per proxy
for elem in elems:
    tds = elem.find_elements_by_xpath("./td")  # look the cells up once per row
    print(tds[0].text)  # IP address
    print(tds[1].text)  # port

driver.close()
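
Note: find_elements_by_xpath was removed in Selenium 4, so the snippet above only runs on Selenium 3. A minimal sketch of the same scrape on current versions, using find_elements with By.XPATH:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("http://www.kuaidaili.com/free/inha/4/")
driver.implicitly_wait(10)
for elem in driver.find_elements(By.XPATH, "//tbody/tr"):
    tds = elem.find_elements(By.XPATH, "./td")
    print(tds[0].text, tds[1].text)  # IP address and port
driver.quit()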

  Proxy validation:

import urllib.request

try:
    httpproxy = urllib.request.ProxyHandler({"http": "10.36.132.16:808"})  # this proxy needs no credentials
    opener = urllib.request.build_opener(httpproxy)  # build an opener that routes requests through the proxy
    request = urllib.request.Request("http://www.baidu.com/")  # fetch Baidu as a reachability test
    response = opener.open(request, timeout=10)  # open the page through the proxy
    print(response.read())
    print("OK")
except Exception:
    print("NO")

  Retrieve usable proxies and save them to a local txt file:

import selenium
import selenium.webdriver
import urllib.request
import lxml.etree
# Works in testing, but it is slow; rewriting the second function with urllib would speed it up.

def urllist(url):
    # Collect the URL of every listing page by reading the page count from the pagination bar.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    request.add_header("Connection", "keep-alive")  # reuse the connection across requests
    response = urllib.request.urlopen(request)
    data = response.read()
    mytree = lxml.etree.HTML(data)
    numbers = mytree.xpath("//*[@id=\"listnav\"]/ul/li[9]/a/text()")  # last page number in the pagination bar
    numbers1 = int(numbers[0])  # int() is safer than eval() for parsing the count
    urllist = []
    for i in range(1, numbers1 + 1):
        urllist.append(url + str(i) + "/")
    return urllist

def textlist(url):
    # Scrape one listing page with Selenium and return every proxy that passes validation.
    driver = selenium.webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(10)  # wait up to 10 seconds for each element to appear
    # find_elements (plural, with an s) returns all matching rows at once
    elems = driver.find_elements_by_xpath("//tbody/tr")
    dailist = []
    for elem in elems:
        # ./td selects the td children of the current row
        tds = elem.find_elements_by_xpath("./td")
        ipnum = tds[0].text
        kounum = tds[1].text
        daili = ipnum + ":" + kounum
        try:
            httpproxy = urllib.request.ProxyHandler({"http": daili})  # this proxy needs no credentials
            opener = urllib.request.build_opener(httpproxy)  # build an opener that routes requests through the proxy
            request = urllib.request.Request("http://www.baidu.com/")  # fetch Baidu as a reachability test
            opener.open(request, timeout=10)
            print(daili)
            print("OK")
            dailist.append(daili)
        except Exception:
            print("NO")
    driver.quit()  # close the browser so each page does not leak a Chrome instance
    return dailist  # return after checking every row, not inside the loop after the first hit

url="https://www.kuaidaili.com/free/inha/"
savefilepath="daili.txt"
savefile=open(savefilepath,"wb")
for urls in urllist(url):
    textlist(urls)
    if len(textlist(urls))!= 0:
        dailistr = " ".join(textlist(urls))
        savefile.write((dailistr + "\r\n").encode("utf-8"))
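
To reuse the saved proxies later, the file can be split back into a list. A minimal sketch, assuming daili.txt was written by the script above (entries separated by spaces, lines by \r\n):

with open("daili.txt", "r", encoding="utf-8") as f:
    proxies = f.read().split()  # split() handles both the spaces and the line breaks
print(proxies[:5])  # the first few saved ip:port strings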

 
