python - 使用“scrapy”进行 Python 网络抓取:无法从 span 中提取文本
问题描述
问题出在第 54 行
port = data.xpath('./td[3]/span[@class="port"]/text()').extract_first()
完整代码
import json
import re
import urllib
from html.parser import HTMLParser
from urllib.parse import urljoin
from scrapy import Field, Item, Selector
from scrapy.http import FormRequest, HtmlResponse, Request
from scrapy.spiders import CrawlSpider
from requests import Session
class ProxyServersPro(Item):
    """Scrapy item describing one proxy row scraped from the listing table.

    Each attribute is a plain ``Field()`` with no metadata. ``speed`` was
    previously declared twice; the redundant declaration has been removed.
    """

    ip = Field()
    port = Field()
    country = Field()
    speed = Field()
    protocol = Field()
    anon = Field()
    lastcheck = Field()
class ProxyServers(CrawlSpider):
    """Spider that scrapes proxy rows from the proxyservers.pro listing pages.

    Requests are issued for every URL in ``start_url`` with the browser-like
    ``headers`` below, and each table row of the response is turned into a
    ``ProxyServersPro`` item by ``parse_companies``.
    """

    name = "ProxyServersProCrawler"
    allowed_domains = ["proxyservers.pro"]

    # Headers sent with every listing request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    # Listing pages to crawl. The attribute is named ``start_url`` (not
    # Scrapy's ``start_urls``) and is consumed explicitly by start_requests().
    start_url = [
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/1",
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/2",
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/3",
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/4",
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/5",
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the spider, forwarding any arguments to ``CrawlSpider``.

        Accepting ``*args``/``**kwargs`` keeps the no-argument call working
        while no longer raising ``TypeError`` when arguments are supplied.
        """
        super(ProxyServers, self).__init__(*args, **kwargs)

    def start_requests(self):
        """Yield one ``Request`` per listing URL, handled by parse_companies()."""
        for url in self.start_url:
            yield Request(url, callback=self.parse_companies, headers=self.headers)

    def parse_companies(self, response):
        """Yield a ``ProxyServersPro`` item for every row of the proxy table.

        Args:
            response: The downloaded listing page.

        Yields:
            ProxyServersPro: Item with ``ip``, ``country``, ``protocol``,
            ``anon`` and ``port`` set; a value is ``None`` when its XPath
            matches nothing.
        """
        rows = response.xpath('//table[@class="table table-hover"]/tbody/tr')
        for row in rows:
            # Prefer the span's text, but fall back to its ``data-port``
            # attribute: when the span carries no text node the text() query
            # yields None and the port would otherwise be lost.
            # NOTE(review): the attribute value looks encoded (e.g.
            # '050F040E') rather than a plain port number - it is stored
            # as-is here; confirm whether decoding is required downstream.
            port = row.xpath('./td[3]/span[@class="port"]/text()').extract_first()
            if not port:
                port = row.xpath("./td[3]/span/@data-port").extract_first()

            item = ProxyServersPro()
            item["ip"] = row.xpath("./td[2]/a/text()").extract_first()
            item["country"] = row.xpath("./td[4]/text()").extract_first()
            item["protocol"] = row.xpath("./td[7]/text()").extract_first()
            item["anon"] = row.xpath("./td[8]/text()").extract_first()
            item["port"] = port
            yield item
解决方案
Scrapy 没有提取到任何值,因为该 span 标签内没有文本内容(端口信息保存在 data-port 属性中)。
尝试这个:
(Pdb) data.xpath('./td[3]/span/@data-port').extract_first()
'050F040E'
推荐阅读
- reactjs - React Typescript 获取:useState UseEffect pokeapi
- windows - 在 Windows 中批量编译 Kotlin 源代码时出错
- android - 如何修复错误:没有方法签名:build_754ga1wplg93aizkkd14wn52w.android()
- flutter - 如何删除 ExpansionTile 中的默认填充顶部和底部?
- racket - 有人可以解释为什么我会违反 mpair 合同吗
- twilio - 系统发起的 IVR 呼叫以捕获语音响应
- python - 如何使用python请求发布从url以.aspx结尾的页面下载excel
- javascript - 试图居中标签和标记内?
- apache-poi - Spring Boot,Maven,Java - 从数据库中导出 excel 文件的数据...面临错误:java.lang.ClassNotFoundException: org.apache.poi.ss.usermodel.Row
- javascript - 无法从 Angular2-wizard 表单步进器呈现的 html 中删除边框