python - Web Scraper 问题:只能解析字符串
问题描述
我最近在香港选举平台上写了一个2级刮板,效果很好。该代码允许我检索地区级别的信息。代码如下:
from typing import List
import requests
import csv
from lxml import etree
from urllib.parse import urljoin
class hongkongelection:
def __init__(self):
self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can/A.html'
def send_request(self, url):
r = requests.get(url)
if r.text:
html_result = r.text
print('get result la')
return html_result
else:
print('get result fail la')
return ''
def extract_info_urls(self, response):
raw_tree = etree.HTML(response)
platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
#self.pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
platform_urls: List[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
return platform_urls
def extract_info(self, platform_urls):
raw_tree = etree.HTML(platform_urls)
dict_result = {}
dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
return dict_result
def save_information(self, raw_json):
with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
csv_writer =csv.DictWriter(out_f, raw_json.keys())
if out_f.tell() == 0:
csv_writer.writeheader()
csv_writer.writerow(raw_json)
def run(self):
response = self.send_request(self.url)
platform_urls = self.extract_info_urls(response)
for url in platform_urls:
info_response = self.send_request(url)
raw_json =self.extract_info(info_response)
raw_json['platform_url'] = url
self.save_information(raw_json)
if __name__ == '__main__' :
runner = hongkongelection()
runner.run()
尽管如此,因为我想提高我的技能,我尝试做一个 3 级刮刀。我想同时刮掉18个区所有政客的讲台。
class hongkongelection:
def __init__(self):
self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'
def send_request(self, url):
r = requests.get(url)
if r.text:
html_result = r.text
print('get result la')
return html_result
else:
print('get result fail la')
return ''
def extract_info_urls_district(self, response):
raw_tree = etree.HTML(response)
district_urls = raw_tree.xpath('//*[@id="content-area"]/table[2]/tr/td/div/ol/li/a/@href')
scraped_url_district = "https://www.elections.gov.hk/dc2019/eng/intro_to_can.html"
#pdf_url = "../eng/intro_to_can/A.html"
district_urls = [urljoin(scraped_url_district, pdf_url) for pdf_url in district_urls]
return district_urls
def extract_info_urls_platform(self, district_urls):
raw_tree = etree.HTML(district_urls)
platform_urls = raw_tree.xpath('//*[@id="table-district-member"]/tbody/tr/td[6]/div/a/@href|//*[@id="table-district-member"]/tbody/tr/td[4]/div/a/@href')
scraped_url = "https://www.elections.gov.hk/dc2019/eng/intro_to_can/H.html"
#pdf_url = "../../pdf/intro_to_can/A01_1_ENG.html"
platform_urls: list[str] = [urljoin(scraped_url, pdf_url) for pdf_url in platform_urls]
return platform_urls
def extract_info(self, platform_urls):
raw_tree = etree.HTML(platform_urls)
dict_result = {}
dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
return dict_result
def save_information(self, raw_json):
with open('platform.csv', 'a+', encoding='UTF-8') as out_f:
csv_writer =csv.DictWriter(out_f, raw_json.keys())
if out_f.tell() == 0:
csv_writer.writeheader()
csv_writer.writerow(raw_json)
def run(self):
response = self.send_request(self.url)
district_urls = self.extract_info_urls_district(response)
platform_urls = self.extract_info_urls_platform(district_urls)
for url in platform_urls:
info_response = self.send_request(url)
raw_json =self.extract_info(info_response)
raw_json['platform_url'] = url
self.save_information(raw_json)
if __name__ == '__main__' :
runner = hongkongelection()
runner.run()
但它失败了。我想知道我做错了什么。
完整追溯:
Traceback (most recent call last):
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\pydevd.py", line 1477, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
File "C:\Program Files\JetBrains\PyCharm Community Edition 2020.3.2\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 71, in <module>
runner.run()
File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 61, in run
platform_urls = self.extract_info_urls_platform(district_urls)
File "C:/Users/BUS-0556/PycharmProjects/webscraper/scraper for platform more.py", line 31, in extract_info_urls_platform
raw_tree = etree.HTML(district_urls)
File "src/lxml/etree.pyx", line 3185, in lxml.etree.HTML
File "src/lxml/parser.pxi", line 1895, in lxml.etree._parseMemoryDocument
ValueError: can only parse strings
感谢您的帮助和时间 - 期待从这个了不起的社区中学习!
解决方案
您试图使用 lxml 解析器直接抓取内容,甚至不发送请求。我已经对您的 xpath 进行了一些更改,但这是不必要的。我还使用生成器来提高效率。确保save_information
在脚本中添加这个方法,因为我不得不把它踢出来看看发生了什么:
import csv
import time
import random
import requests
from lxml import etree
from typing import List
from urllib.parse import urljoin
class hongkongelection(object):
def __init__(self):
self.url = 'https://www.elections.gov.hk/dc2019/eng/intro_to_can.html'
def send_request(self, url):
r = requests.get(url)
r.raise_for_status()
return r.text
def extract_info_urls_district(self, url):
res = self.send_request(url)
raw_tree = etree.HTML(res)
for pdf_url in raw_tree.xpath('//a[contains(@href,"/intro_to_can/")]/@href'):
yield urljoin(url,pdf_url)
def extract_info_urls_platform(self, url):
res = self.send_request(url)
raw_tree = etree.HTML(res)
for pdf_url in raw_tree.xpath('//*[@id="table-district-member"]//a[contains(@href,"/pdf/intro_to_can/") and contains(.,"Text")]/@href'):
yield urljoin(url,pdf_url)
def extract_info(self, url):
res = self.send_request(url)
raw_tree = etree.HTML(res)
dict_result = {}
dict_result['namelist'] = raw_tree.xpath("//*[@id=\"main\"]/p[2]/span[2]/text()")
dict_result['namelist'] = [x.replace("\r\n", "") for x in dict_result['namelist']]
dict_result['partylist'] = raw_tree.xpath("//*[@id=\"main\"]/p[5]/span[2]/text()")
dict_result['partylist'] = [x.replace("\r\n", "") for x in dict_result['partylist']]
dict_result['message_list'] = raw_tree.xpath("//*[@id=\"main\"]/p[8]/span/text()")
dict_result['message_list'] = [x.replace("\r\n", "") for x in dict_result['message_list']]
return dict_result
def run(self):
for district_url in self.extract_info_urls_district(self.url):
for url in self.extract_info_urls_platform(district_url):
raw_json = self.extract_info(url)
raw_json['platform_url'] = url
print(raw_json)
time.sleep(random.randint(3,8))
if __name__ == '__main__' :
runner = hongkongelection()
runner.run()
推荐阅读
- java - android:RecyclerView set adapter
- azure - Xamarin iOS 在 Azure 中是否可以为同一个 Web API 提供两个通知中心,一个处于生产模式,另一个处于开发模式?
- java - 从 Web SRC URL 生成文件
- excel - Excel 将一维数组转置为多维数组
- python - 检查是否包含字符串时获得完全匹配是行
- android-uiautomator - 如何计算 UiAutomator 中所有 RecyclerView 项目的不同项目
- python - 唯一约束失败:“app”_customuser.username
- javascript - 强制将块拆分为某个包
- django - 多个附件django电子邮件
- linq - 实体框架查询单个查询中的相关实体