python - Scrapy Crawler:从页面中抓取列表
问题描述
目标:抓取此页面
https://www.cardplayer.com/poker-tournaments/monthly/2021/06
然后在每个页面上获取所有锦标赛的列表。这是我的代码
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import pandas as pd

# Accumulate one dict per tournament row; build the DataFrame once after the
# crawl finishes. (Assigning xpath selector lists to mydf columns inside the
# callback overwrote the frame on every page, which is why only one page's
# worth of data survived.)
rows = []


class TournamentsSpider(CrawlSpider):
    """Follow every monthly tournament link and scrape the schedule tables."""

    name = 'tournaments'
    allowed_domains = ['www.cardplayer.com']
    start_urls = ['https://www.cardplayer.com/poker-tournaments/monthly/2021/06']
    rules = (
        Rule(LinkExtractor(restrict_xpaths='/html/body/div[5]/div/div[2]/div[2]/div[3]/div/table/tbody/tr/td[2]/a'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Some pages carry more than one schedule table; looping over every
        # matching tbody handles both the 1-table and 2-table layouts.
        for tbody in response.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbody'):
            # Use paths relative to each row. The original absolute path
            # contained the typo 'tbodytr' and therefore matched nothing.
            for tr in tbody.xpath('./tr'):
                rows.append({
                    'Event': tr.xpath('./td[1]/a/text()').get(),
                    'start': tr.xpath('./td[2]/text()').get(),
                    'days': tr.xpath('./td[3]/text()').get(),
                    'buyin': tr.xpath('./td[4]/text()').get(),
                })


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(TournamentsSpider)
process.start()  # blocks until the crawl completes

mydf = pd.DataFrame(rows)
print(mydf)
我可以看到爬虫找到了所有 URL,但输出只包含一页的数据,所以我一定是哪里做错了。
解决方案
这是我尝试使用 bs4 执行此操作的方法,只需输入您想要收集的年数。
# Get Product Page Links
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
baseurl = 'https://www.cardplayer.com/poker-tournaments/monthly/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
Tournaments = []
def GetPageData(url):
    """Scrape one monthly schedule page and append its rows to Tournaments.

    Parameters
    ----------
    url : str
        Full cardplayer.com monthly schedule URL (baseurl + 'YYYY/MM').
    """
    # Send the browser User-Agent: the module defines `headers` but the
    # original call never used it, so requests announced itself as
    # python-requests and risked being blocked.
    r = requests.get(url, headers=headers)
    try:
        soup = BeautifulSoup(r.content, 'lxml')
        # Every schedule row is rendered as a <tr> with an empty class.
        for i, row in enumerate(soup.find_all('tr', class_='')):
            if i == 0:
                continue  # first row holds the column titles
            # One cell per line once surrounding whitespace is stripped.
            fields = str(row.text).strip().splitlines()
            # Pad so rows with missing trailing cells yield empty strings
            # (replaces the original index-matching inner loop).
            date, name, location = (fields + ['', '', ''])[:3]
            # Skip the header repeat and the "no tournaments" placeholder.
            if date != "Dates" and date != 'No tournament series found.':
                print('Added: ', name)
                Tournaments.append({
                    'date': date,
                    'name': name,
                    'location': location
                })
    finally:
        # Always release the connection, even if parsing raises.
        r.close()
def GetTournaments(yearsToCollect):
    """Scrape monthly tournament listings and save them to Tournaments.xlsx.

    Parameters
    ----------
    yearsToCollect : int
        Number of calendar years to cover, starting with the current one.
    """
    today = datetime.today()
    currentMonth = today.month
    currentYear = today.year

    for yearOffset in range(yearsToCollect):
        if yearOffset == 0:
            # Rest of the current year, INCLUDING December: the original
            # `range(12 - currentMonth)` stopped one month short.
            months = range(currentMonth, 13)
        else:
            # Full later year. Months run 1-12: the original `range(12)`
            # requested the invalid month 0 and skipped December.
            months = range(1, 13)
        for month in months:
            # Zero-pad the month to match the site's /YYYY/MM URL scheme.
            GetPageData(baseurl + str(currentYear + yearOffset) + '/' + str(month).zfill(2))

    # Save everything collected to .xlsx
    Tournamentsdf = pd.DataFrame(Tournaments)
    Tournamentsdf.to_excel('Tournaments.xlsx', index=False)


if __name__ == "__main__":
    yearsToCollect = 2
    GetTournaments(yearsToCollect)
推荐阅读
- video - 如何自动关闭 GStreamer 应用程序?
- node.js - 如何使用 zeit now server 为 NodeJS 表单应用程序提供服务
- android - $BITRISE_DEPLOY_DIR 有哪些可用文件?
- php - 从数据库的codeigniter中的开始时间字段计算结束时间
- javascript - 使用 angularjs 将 yaml 数据解析到浏览器
- javascript - React-testing-library:在组件的上下文中找不到“存储”
- python - 在 VS 代码中从继承的 python 类中覆盖方法时获取自动完成
- flutter - Flutter 如何使 Wrap 小部件中的 Text 小部件从前一个小部件的末尾开始,而不是从前一个小部件开始?
- scala - 以“Scala 方式”基于另一个集合对一组值进行排序
- php - 如何删除codeigniter中的index.php?,我尝试了正常的技术,但没有奏效