Scrapy Crawler: Scraping a List from Each Page

Problem Description

The goal: scrape this page

https://www.cardplayer.com/poker-tournaments/monthly/2021/06

and then get the list of all the tournaments on each page. Here is my code:

from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import pandas as pd

mydf = pd.DataFrame()


class TournamentsSpider(CrawlSpider):
    name = 'tournaments'
    allowed_domains = ['www.cardplayer.com']
    start_urls = ['https://www.cardplayer.com/poker-tournaments/monthly/2021/06']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='/html/body/div[5]/div/div[2]/div[2]/div[3]/div/table/tbody/tr/td[2]/a'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # I'm aware that some of the pages have two tables (I was thinking an
        # if statement on the length of the response, then running on table 1
        # for one-table pages and table 2 for two-table pages).
        for series in response.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbody'):
            mydf["Event"] = series.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbodytr/td[1]/a/text()')
            mydf["start"] = series.xpath('.//tr/td[2]/text()')
            mydf["days"] = series.xpath('.//tr/td[3]/text()')
            mydf["buyin"] = series.xpath('.//tr/td[4]/text()')


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(TournamentsSpider)
process.start()

print(mydf)

I can see the crawler finding all of the URLs, but the output only returns one page, so I'm doing something wrong.

Tags: python, scrapy

Solution
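
A quick note on why the original spider only seems to return one page: parse_item assigns to the same DataFrame columns (mydf["Event"] = ...) on every callback, so each page overwrites the previous one instead of appending (and the first XPath has a typo: tbodytr should be tbody/tr). The idiomatic Scrapy fix is to yield one item per row and let a feed export collect them. A minimal sketch of such a callback, assuming the row XPaths from the question match the page layout:

def parse_item(self, response):
    # Yield one dict per table row; Scrapy aggregates yielded items
    # across every crawled page (e.g. run with -O tournaments.csv).
    for row in response.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbody/tr'):
        yield {
            'Event': row.xpath('./td[1]/a/text()').get(),
            'start': row.xpath('./td[2]/text()').get(),
            'days': row.xpath('./td[3]/text()').get(),
            'buyin': row.xpath('./td[4]/text()').get(),
        }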


Here is how I attempted to do this using bs4; just enter the number of years you want to collect.

# Get Product Page Links
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

baseurl = 'https://www.cardplayer.com/poker-tournaments/monthly/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
Tournaments = []

def GetPageData(url):
    # Get a single page's info (send the headers defined above)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    # Get all tr elements with empty class
    productlist = soup.find_all('tr', class_='')
    for i, item in enumerate(productlist):
        # Skip first row of table titles
        if (i != 0):
            # strip leading/trailing whitespace
            RawTournamentInfo = str(item.text).strip()
            # splits into list by new lines
            RawTournamentInfo = RawTournamentInfo.splitlines()
            # Create empty strings
            Date = ''
            Name = ''
            Location = ''
            # had to loop over the list; for some reason direct indexing wasn't working
            for i, item in enumerate(RawTournamentInfo):
                if i == 0: Date = item
                if i == 1: Name = item
                if i == 2: Location = item
            # Creating object and saving to list
            if (Date != "Dates") and (Date != 'No tournament series found.'):
                print('Added: ', Name)
                tournament = {
                    'date': Date,
                    'name': Name,
                    'location': Location
                }
                Tournaments.append(tournament)
    r.close()

def GetTournaments(yearsToCollect):
    #Get Current Year/Month
    today = datetime.today()
    currentMonth = today.month
    currentYear = today.year

    for year in range(yearsToCollect):
        # Finish the current year (current month through December)
        if (year == 0):
            for month in range(currentMonth, 13):
                GetPageData(baseurl + str(currentYear) + '/' + str(month))
        # All other years (January through December)
        else:
            for month in range(1, 13):
                GetPageData(baseurl + str(currentYear + year) + '/' + str(month))

    # Save to .xlsx
    Tournamentsdf = pd.DataFrame(Tournaments)
    Tournamentsdf.to_excel('Tournaments.xlsx', index=False)

if __name__ == "__main__":
    yearsToCollect = 2
    GetTournaments(yearsToCollect)
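
As a sanity check on the month loop: run in June 2021 with yearsToCollect = 2, GetTournaments requests .../2021/6 through .../2021/12 for the remainder of the current year, then .../2022/1 through .../2022/12, and finally writes everything collected to Tournaments.xlsx. One caveat: the site's canonical URLs zero-pad the month (e.g. poker-tournaments/monthly/2021/06), so if the server doesn't accept the unpadded form, build the URL with str(month).zfill(2) instead.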

