python - 分页链接在我的 BeautifulSoup Python 代码中重复
问题描述
from bs4 import BeautifulSoup
import requests
import csv
class Parse():
    """Scrape one TripAdvisor attractions listing page.

    ``parse`` downloads a listing page and returns the URL of the page the
    pagination anchor points to, together with one row per attraction card.
    ``write_csv`` stores such rows in ``top_sites.csv``.
    """

    def __init__(self):
        self.row_list = []
        self.base_url = 'https://www.tripadvisor.co.uk'

    def parse(self, url):
        """Fetch *url* and extract the pagination link and the card rows.

        Args:
            url: Address of the listing page to download.

        Returns:
            A ``(next_page, rows)`` tuple. ``next_page`` is the absolute URL
            built from the pagination anchor, or ``None`` when the page has
            no such anchor. ``rows`` is a list with one
            ``[name, rating, status, review_count]`` entry per card; it is
            empty when no card matches.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'}
        # The headers must be passed by keyword: the second positional
        # argument of requests.get() is `params`, so passing the dict
        # positionally would put it in the query string instead of the
        # request headers.
        response = requests.get(url, headers=headers).text
        soup = BeautifulSoup(response, 'html.parser')

        # NOTE(review): find() returns only the FIRST matching anchor. The
        # sample run recorded next to this code started from the "-oa30-"
        # page and printed the first-page URL, so this class list presumably
        # also matches the "previous" button -- confirm against the live
        # markup before relying on this as the "next" link.
        next_link = soup.find('a', class_='_23XJjgWS _1hF7hP_9 _2QvUxWyA')
        if next_link is None:
            # No pagination anchor on this page; report that instead of
            # crashing on None.attrs.
            next_page = None
        else:
            next_page = self.base_url + next_link.attrs['href']

        # Collect every card instead of overwriting a single row, so the
        # result can be handed straight to write_csv().
        rows = []
        cards = soup.find_all('section', class_='_2TabEHya _3YhIe-Un')
        for card in cards:
            name = card.find('div', class_='_1gpq3zsA _1zP41Z7X').text
            rating = str(card.find('svg', class_='zWXXYhVR'))
            rating = self.remove(filter_col=rating)
            review_count = card.find('span', class_='DrjyGw-P _26S7gyB4 _14_buatE _1dimhEoy').text
            status = card.find('div', class_='DrjyGw-P _26S7gyB4 _3SccQt-T').text
            rows.append([name, rating, status, review_count])
        return next_page, rows

    def remove(self, filter_col):
        """Return the last three characters of the second space-separated token.

        ``parse`` passes the stringified ``<svg>`` rating tag here.

        Args:
            filter_col: Text to extract the rating from.

        Returns:
            The trailing (up to) three characters of the second token.

        Raises:
            IndexError: If *filter_col* contains no space.
        """
        rating = filter_col.split(' ')[1]
        rating = rating[-3:]
        return rating

    def write_csv(self, row_list):
        """Write *row_list* to ``top_sites.csv``, replacing any existing file.

        Args:
            row_list: Iterable of rows, each row an iterable of cell values.
        """
        # newline='' is what the csv module asks for: without it, Windows
        # text mode turns the writer's "\r\n" into "\r\r\n" and every other
        # row shows up blank. UTF-8 keeps non-ASCII names from failing on
        # platforms whose default encoding cannot represent them.
        with open('top_sites.csv', 'w', newline='', encoding='utf-8') as file:
            csv_writer = csv.writer(file, delimiter=',')
            csv_writer.writerows(row_list)
if __name__ == '__main__':
    # Script entry point: parse a single listing page and show the
    # pagination URL that was extracted from it.
    start_url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html"
    scraper = Parse()
    result = scraper.parse(url=start_url)
    following_url, _page_rows = result
    print(following_url)
PS C:\Users\Caspe\PycharmProjects\Selenium Test> & "c:/Users/Caspe/PycharmProjects/Selenium Test/.venv/Scripts/python.exe" "c:/Users/Caspe/PycharmProjects/Selenium Test/Demo/tripadvisor_topattract.py"
https://www.tripadvisor.co.uk/Attractions-g294190-Activities-Myanmar.html
PS C:\Users\Caspe\PycharmProjects\Selenium Test>
我正在尝试使用 BeautifulSoup 从 TripAdvisor 网站上抓取数据。链接:https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html 。代码没有转到下一页,而是重复了链接本身。这个问题有解决办法吗?我为 soup 选择了正确的选择器,并且能够抓取到数据。
解决方案
要使分页正常工作,需要更改 URL 中的 -oa<index>- 部分:
import csv
import requests
from bs4 import BeautifulSoup
# URL template for the listing pages; the "{}" inside the "-oa{}-" segment is
# filled with a different number for every page requested below.
url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa{}-Myanmar.html"

# One tuple per attraction: (number, title, rating, review count, status).
data = []

for page in range(0, 4):  # <--- increase page count here
    print("Getting page {}..".format(page))

    # The number placed in the URL advances in steps of 30 per page.
    soup = BeautifulSoup(
        requests.get(url.format(page * 30)).content, "html.parser"
    )

    titles = soup.select('span[name="title"]')
    for title in titles:
        # The title text is joined with "|" and split into exactly two parts;
        # any other number of parts raises ValueError.
        no, t = title.get_text(strip=True, separator="|").split("|")

        # The remaining fields are located relative to the title, in
        # document order: next <svg>, then next <span>, then the status <div>.
        rating = title.find_next("svg")
        review_count = rating.find_next("span")

        data.append(
            (
                no,
                t,
                rating["title"],
                review_count.text,
                review_count.find_next(
                    "div", class_="DrjyGw-P _26S7gyB4 _3SccQt-T"
                ).text,
            )
        )

# newline="" is what the csv module asks for (otherwise Windows output gets a
# blank line after every row); UTF-8 keeps non-ASCII titles from failing on
# platforms whose default encoding cannot represent them.
with open("data.csv", "w", newline="", encoding="utf-8") as f_out:
    w = csv.writer(f_out)
    w.writerows(data)
运行后会写入 data.csv
(以下是在 LibreOffice 中打开的屏幕截图):
推荐阅读
- ruby-on-rails - Rails 5:视图/控制器中未初始化的常量,但不是控制台
- r - 在 mlr 和 parallelMap 中可以并行化多个级别吗?例如 mlr.tuneParams 和 mlr.benchmark
- jboss - log4j2 似乎没有遵循我的日志滚动策略
- reactjs - 将 fetch 操作转换为 axios 操作
- c# - “必须声明标量变量@dni”
- c# - 如何修复清单中的不匹配引用和下载的程序集 LINQPad.exe 的标识
- javascript - 如何将相同的选项附加到多个选择下拉框?
- python - Flask Dash 将回调中生成的变量传递给另一个回调
- semplot - 如何为参数估计和协方差单独标记 edge.label.position
- c# - 在 .Net Core 中使用 CSharpCodeProvider