Python Web Scraper Exits Early

Problem Description

I'm currently working on a web scraper, and it almost works, except that the script terminates early. I want it to loop through the pages at https://www.carsireland.ie/search#q?sortBy=vehicles_prod%2Fsort%2Fpoa%3Aasc%2Cupdated%3Adesc&page=1 and scrape the URL of every advert. Even though there are roughly 52,000 URLs on the site, it stops every time after scraping 2,240 of them. Any ideas would be greatly appreciated.
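
One way to narrow this down is a minimal diagnostic sketch (assuming the same search URL and results container as the full script below; the page range here is arbitrary) that logs how many links each rendered page actually returns, so you can see exactly where the results stop:

from requests_html import HTMLSession

session = HTMLSession()
base_url = ('https://www.carsireland.ie/search#q?'
            'sortBy=vehicles_prod%2Fsort%2Fpoa%3Aasc%2Cupdated%3Adesc&page=')

# probe a handful of pages and report how many links each one yields
for page in range(1, 6):
    r = session.get(base_url + str(page))
    r.html.render(sleep=1, timeout=20)
    container = r.html.xpath('//*[@id="vehicle-search-root"]/div/div[3]/div[2]/ul', first=True)
    count = len(container.absolute_links) if container is not None else 0
    print("page " + str(page) + ": " + str(count) + " links")

The full script follows.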

from requests_html import HTMLSession
import os
from datetime import date

try:

    session = HTMLSession()

    # create csv for the scraped urls
    filename = "carsireland/carsireland_urls.csv"
    f = open(filename, "w")

    # path of the file holding the last scraped url id
    last_url_file = "carsireland/last_url.txt"

    # create log dir for the day
    log_dir = "carsireland/logs/" + str(date.today())
    os.mkdir(log_dir)

    # open log file
    log_file = log_dir + "/url"
    log = open(log_file, "w")

    # read the id of the last url scraped on the previous run
    with open(last_url_file, 'r') as url_id:
        id = url_id.read()
        print("Last URL id: " + str(id))

    # url of the search page (page number is appended below)
    url = 'https://www.carsireland.ie/search#q?sortBy=vehicles_prod%2Fsort%2Fpoa%3Aasc%2Cupdated%3Adesc&page='

    # page number to be incremented
    page = 1

    # boolean to trigger exit when the final result page is reached
    alive = True

    list = []

    # loop through the search pages
    while alive == True:

        # counter of new urls found on the current page
        p = 0

        # request the search page
        r = session.get(url + str(page))

        # render the page so the javascript-loaded results are present
        r.html.render(sleep=1, timeout=20)

        # grab the search results container
        urls = r.html.xpath('//*[@id="vehicle-search-root"]/div/div[3]/div[2]/ul', first=True)

        # loop through the links and append the new ones to the list
        for item in urls.absolute_links:
            url_split = item.split("/")
            url_id = url_split[3]
            if url_id > id:
                list.append(item)

                # count the new url
                p = p + 1

        # if no new urls were found on this page, the final search page
        # has been reached; alive triggers the exit of the while loop
        if p == 0:
            alive = False

        os.system('cls')
        print("Page " + str(page) + " scraped")

        page = page + 1

    # function for sorting urls by their id
    def sort_by_id(url):
        url_split = url.split("/")
        url_id = url_split[3]
        return url_id

    # sort urls
    sorted_list = sorted(list, key=sort_by_id)

    # remove duplicates
    #sorted_list = list(dict.fromkeys(sorted_list))

    # write urls to csv
    for each in sorted_list:
        f.write(str(each) + "\n")
        last_url = each

    # close url file
    f.close()

    # create text file to store the last url id
    last_url_filename = "carsireland/last_url.txt"
    l = open(last_url_filename, "w")

    # split the url to get its id
    split = last_url.split('/')

    # write to text file
    l.write(split[3])

    # print complete message
    print("\n--- URL scrape complete ---\n")

    # write log
    log.write("URL scrape successful\n")
    log.write(str(page) + " pages scraped")
    log.close()

    import carsireland.carsireland_car_scraper as scraper

    # run car scraper
    scraper.scrape()

except Exception:
    log.write("URL scrape unsuccessful")
    log.close()
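
A detail worth checking when reading the script above (an observation about Python string comparison, not a confirmed explanation of the 2,240-URL cut-off): both id, read from last_url.txt, and url_id, taken from item.split("/"), are strings, so url_id > id compares them lexicographically rather than numerically. A short sketch of the difference, with made-up example values:

# Sketch: lexicographic vs. numeric comparison of id strings.
last_id = "99999"      # e.g. the value read from last_url.txt
new_id = "100001"      # e.g. the id segment taken from a scraped url

print(new_id > last_id)              # False: "1" sorts before "9" as a character
print(int(new_id) > int(last_id))    # True: numeric comparison behaves as expected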

Tags: python, web, screen-scraping
