python - 如何包装所有 BeautifulSoup 现有的查找/选择方法以添加其他逻辑和参数?
问题描述
我有一个重复的健全性检查过程,我在大多数调用 BeautifulSoup 对象时都会经历:
- 进行函数调用(大多数情况下是 `.find`、`.find_all`、`.select_one` 和 `.select`)
- 检查以确保找到了元素
  - 如果找不到,我会引发一个自定义的 `MissingHTMLTagError`,并在此处终止流程。
- 尝试从元素中检索属性(使用 `.get` 或 `getattr`)
  - 如果没有找到,我会引发一个自定义的 `MissingHTMLAttributeError`
- 返回以下之一:
  - 字符串:当它是单个元素的单个属性时(`.find` 和 `.select_one`)
  - 字符串列表:当它是多个元素的单个属性时(`.find_all` 和 `.select`)
  - dict:当它是多个元素的两个属性(键/值对)时(`.find_all` 和 `.select`)
我创建了以下解决方案,它充当 BeautifulSoup 方法的代理(不太优雅)。但是,我希望有一个更简单的方法来实现这一点。基本上,我希望能够将所有 BeautifulSoup 方法修补到:
- 允许传递一个额外的参数,以便在一次调用中完成上述步骤
- 如果使用上述任何方法而不提供额外参数,我希望像平常一样返回 BeautifulSoup 对象;如果返回值为 None 或空列表,则引发 `MissingHTMLTagError`。
大多数情况下,下面的函数与一个实例变量(`self._soup`)一起使用,它只是由最近一次 `requests.Response` 解析得到的 BeautifulSoup 对象。
from bs4 import BeautifulSoup
def get_html_value(self, element, attribute=None, soup=None, func="find", **kwargs):
    """Find element(s) via a BeautifulSoup method and optionally extract attributes.

    A proxy that forwards ``element`` and ``**kwargs`` to the method named by
    ``func`` on a soup object, checks that something was found, and (when
    ``attribute`` is given) checks and returns the requested attribute values.

    Args:
        element: First positional argument for the lookup method (a tag name
            for ``find``/``find_all``, a CSS selector for ``select``/``select_one``).
        attribute: ``None`` to return the element(s) themselves, a string to
            return that attribute's value(s), or a 2-tuple ``(key_attr, value_attr)``
            to return a dict mapping one attribute's value to the other's.
        soup: Optional object to search instead of ``self._soup``. A ``str`` or
            a ``requests.Response`` is parsed with ``"html.parser"`` first.
        func: Name of the lookup method to call on the soup object.
        **kwargs: Passed through unchanged to the lookup method.

    Returns:
        * the found element(s) when ``attribute`` is empty/``None``;
        * a string (single element) or list of strings (list result) when
          ``attribute`` is a string;
        * a dict when ``attribute`` is a 2-tuple; a missing value attribute
          is stored as ``""``.

    Raises:
        ValueError: neither ``soup`` nor ``self._soup`` is set, or the
            ``attribute`` tuple does not have exactly two items.
        TypeError: ``attribute`` is not ``None``, a string or a tuple.
        AttributeError: the soup object has no callable named ``func``.
        MissingHtmlError: nothing was found, or a required attribute is
            missing. NOTE(review): this exception class is not defined in the
            visible part of the file; it must be provided by the caller's module.

    **Examples:**

    # Get a single attribute from a single element using BeautifulSoup.find
    >> self.get_html_value("a", "href", attrs={"class": "report-list"})
    >> "example.com/page"

    # Get a single attribute from multiple elements using BeautifulSoup.find_all
    >> self.get_html_value("a", "href", func="find_all", attrs={"class": "top-nav-link"})
    >> ["example.com/category1", "example.com/category2", "example.com/category3"]

    # Getting key/value pairs (representing hidden input fields for POST requests)
    # from a fragment of the full html page (login_form) that only contains the form controls
    >> self.get_html_value("input", ("name", "value"), soup=login_form, func="find_all", attrs={"type": "hidden"})
    >> {"csrf_token": "a1b23c456def", "viewstate": "wxyzqwerty"}

    # Find an element based on one of its parents using func="select_one"
    >> account_balance = self.get_html_value("div#account-details > section > h1", func="select_one")
    >> account_balance.string
    >> "$12,345.67"

    # Using func="select" with no attribute will return BeautifulSoup objects
    >> self.get_html_value("div#accounts > div a", func="select")
    >> [<a href="...">Act. 1</a>, <a href="...">Act. 2</a>, <a href="...">Act. 3</a>]

    # Using func="select" with attribute will return list of values
    >> self.get_html_value("div#accounts > div a", attribute="href", func="select")
    >> ["example.com/account1", "example.com/account2", "example.com/account3"]
    """
    if not any([soup, self._soup]):
        raise ValueError("Class property soup not set and soup parameter not provided")
    if soup:
        # provide parsing for strings and requests.Responses
        if isinstance(soup, str):
            soup = BeautifulSoup(soup, "html.parser")
        elif isinstance(soup, requests.Response):
            soup = BeautifulSoup(soup.text, "html.parser")
    else:
        soup = self._soup

    # ``None`` is the documented "return the element(s)" mode, so it must be
    # accepted here rather than rejected as a bad type.
    if attribute is not None and not isinstance(attribute, (str, tuple)):
        raise TypeError("attribute can only be a string or a tuple")
    if isinstance(attribute, tuple) and len(attribute) != 2:
        raise ValueError("attribute can only be a string or tuple of 2 strings (key/value pairing)")

    # A default keeps a missing method from escaping as a bare getattr error
    # on objects that do not fall back to returning None for unknown names.
    bs_func = getattr(soup, func, None)
    if not callable(bs_func):
        raise AttributeError("Method %s not found in the BeautifulSoup package" % func)

    def missing(attr):
        # Single place that builds the project's error with full call context.
        return MissingHtmlError(self, element, attr, soup, func, kwargs)

    bs_element = bs_func(element, **kwargs)
    if not bs_element:
        raise missing(None)
    if not attribute:
        # no attribute was provided, so return the requested element(s)
        return bs_element

    # find_all/select return a list; find/select_one return a single element.
    is_many = isinstance(bs_element, list)
    found = bs_element if is_many else [bs_element]

    if isinstance(attribute, str):
        values = []
        for el in found:
            el_attribute = el.get(attribute)
            if not el_attribute:
                raise missing(attribute)
            values.append(el_attribute)
        return values if is_many else values[0]

    # key/value pairing: one attribute supplies the dict key, the other the value
    key, value = attribute
    pairs = {}
    for el in found:
        el_key = el.get(key)
        if el_key is None:
            raise missing(attribute)
        pairs[el_key] = el.get(value, "")
    return pairs
有没有办法包装 BeautifulSoup 所有公开的 `.find` 和 `.select` 类方法,这样我仍然可以正常使用这些方法(例如 `soup.find()`),而不必使用我的变通函数?
解决方案
我相信我已经找到了一种简洁合理的方法,通过下面的包装器来实现我想要的功能:
from bs4 import BeautifulSoup
from functools import wraps
import requests
import inspect
import abc
class HTMLParseError(Exception):
    """Error type for HTML parsing failures; caught and ignored by ``MyClass.get_soup``."""
class MissingHTMLTagError(Exception):
    """Raised by the wrapped soup lookups when no element is found."""
class MissingHTMLAttributeError(Exception):
    """Raised by the wrapped soup lookups when a requested attribute is missing."""
class MyClass(metaclass=abc.ABCMeta):
    """Holds a ``requests`` session and keeps a wrapped soup of the latest response.

    Every response received through ``self._sess`` is stored in ``self._resp``
    and parsed into ``self._soup``. The soup's ``find*``/``select*`` methods are
    wrapped so they accept an extra ``extract=`` keyword and raise instead of
    returning ``None`` or an empty list.
    """

    def __init__(self):
        """Create the session and register the response hook."""
        self._sess = requests.Session()
        self._sess.hooks["response"].append(self._session_hook)
        self._resp = None
        self._soup = None

    def _session_hook(self, response, *args, **kwargs):
        """Implicitly sets private instance variables for seamless state-tracking and less boilerplate code"""
        self._resp = response
        # NOTE: every response is parsed; a Content-Type check for "html"
        # could be added here to skip non-HTML responses.
        self.get_soup()

    @staticmethod
    def _extract_value(element, name):
        """Return attribute ``name`` of ``element``, falling back to a property.

        Covers element attributes (``.get``) as well as object properties such
        as ``.string`` (``getattr``). Raises ``MissingHTMLAttributeError`` when
        neither yields a truthy value.
        """
        # The ``None`` default turns an unknown property into the domain error
        # below instead of leaking an ``AttributeError``.
        value = element.get(name) or getattr(element, name, None)
        if not value:
            raise MissingHTMLAttributeError(f"attribute {name!r} not found on element")
        return value

    @staticmethod
    def _extract_pair(element, key, value):
        """Return ``(element[key], element[value] or "")`` for a key/value extract.

        Raises ``MissingHTMLAttributeError`` when the key attribute is absent;
        an absent value attribute is reported as an empty string.
        """
        el_key = element.get(key)
        if el_key is None:
            raise MissingHTMLAttributeError(f"attribute {key!r} not found on element")
        return el_key, element.get(value, "")

    def _wrapped_soup(self, soup):
        """Wrap every callable ``find*``/``select*`` attribute of ``soup`` in place.

        The wrapped methods accept an optional ``extract`` keyword:

        * ``None`` (default) - return the element(s) unchanged;
        * ``str`` - return that attribute/property of the element, or a list
          of them when the underlying call returned a list;
        * 2-tuple ``(key, value)`` - return a dict mapping the ``key``
          attribute to the ``value`` attribute (``""`` when absent).

        They raise ``MissingHTMLTagError`` when the underlying call returns a
        falsy result, ``MissingHTMLAttributeError`` when a requested attribute
        is missing, and ``TypeError`` for an invalid ``extract`` argument.

        Returns the same ``soup`` object that was passed in.
        """
        extract_value = self._extract_value
        extract_pair = self._extract_pair

        def soup_wrapper(fn):
            @wraps(fn)
            def wrapped_soup(*args, **kwargs):
                # ``extract`` is ours; it must not reach the underlying method.
                extract = kwargs.pop("extract", None)
                if not isinstance(extract, (str, tuple, type(None))):
                    raise TypeError("extract can only be of type None, str, or tuple")
                if isinstance(extract, tuple) and len(extract) != 2:
                    raise TypeError("extract tuple can only contain two values; key/value pair")

                elements = fn(*args, **kwargs)
                if not elements:
                    raise MissingHTMLTagError(
                        f"{getattr(fn, '__name__', 'lookup')} found no element for {args!r} {kwargs!r}"
                    )
                if not extract:
                    return elements

                if isinstance(elements, list):
                    # list results, e.g. `soup.find_all` and `soup.select`
                    if isinstance(extract, str):
                        return [extract_value(el, extract) for el in elements]
                    key, value = extract
                    return dict(extract_pair(el, key, value) for el in elements)

                # single-element results, e.g. `soup.find` and `soup.select_one`
                if isinstance(extract, str):
                    return extract_value(elements, extract)
                key, value = extract
                return dict([extract_pair(elements, key, value)])

            return wrapped_soup

        # wrap all methods that start with find or select
        for name in dir(soup):
            if not name.startswith(("find", "select")):
                continue
            member = getattr(soup, name)
            if callable(member):
                setattr(soup, name, soup_wrapper(member))
        return soup

    def get_soup(self):
        """Parse ``self._resp.text`` with ``html.parser`` into a wrapped ``self._soup``."""
        try:
            self._soup = self._wrapped_soup(BeautifulSoup(self._resp.text, "html.parser"))
        except HTMLParseError:
            # Since this runs implicitly from the response hook it must fail
            # gracefully; a warning could be logged here.
            # NOTE(review): nothing visible in this file raises HTMLParseError - confirm
            # which exception the parser actually raises.
            pass
# Demo: build the session holder, fetch a page, and use the wrapped soup.
cls = MyClass()
# Performs a live HTTP GET; the response hook registered in __init__ stores
# the response and populates cls._soup.
cls._sess.get("https://www.example.com")
# `extract="href"` asks the wrapped find() for the attribute value rather than the element.
test = cls._soup.find("a", extract="href")
print("test:", test)
推荐阅读
- sql - 如何通过最大值知道其他列的值(PostgreSQL)?
- database - MongoDB 是否使用严格单调递增的 ID 导出?
- vue.js - 如何覆盖 Vue-cli3 中的环境变量?
- matlab - Octave - 为什么 surf 不起作用,而 trisurf 起作用?
- algorithm - 带数值的伪代码流程图
- python - django 外键关系
- corda - 无论如何,不使用 Schema 就可以查询 Vault 中的状态?
- csv - 从 CSV Prestashop 1.7.4.0 导入客户时出现问题
- spring - Spring data jpa:即使在引发异常后也会执行代码
- javascript - 绕过并发 ajax 调用限制的最佳实践