首页 > 解决方案 > 如果满足条件,则从 CSV 文件中删除行

问题描述

我正在尝试抓取pickels.com.au。

如果链接相同并且价格不同,我正在尝试更新 pickels_dataset.csv 文件,我将删除列表并将新行插入到 CSV 文件中,但它不会从中删除旧条目CSV 文件。

删除和更新 CSV 文件中的行的最佳方法是什么。

下面是我的代码...

import requests
from scrapy.selector import Selector
import csv
import re
from tqdm import tqdm
from time import sleep


with open('pickels_dataset.csv', 'a+', newline='', encoding='utf-8') as auction_csv_file:
    auction_csv_writer = csv.writer(auction_csv_file)
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link)
            response = Selector(text=auction_request.text)

            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if sales_id == []:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link)

            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            if auctions_data == []:
                print("NO RESULTS")
            for auction_data in auctions_data:
                if int(auction_data.get('MinimumBid')) > 0:
                    ids = auction_data.get('TargetId')
                    main_title = auction_data.get('Title')
                    short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                    make = auction_data.get('M ake')
                    model = auction_data.get('Model')
                    variant = auction_data.get('Series')
                    transmission = auction_data.get('Transmission')
                    odometer = auction_data.get('Odometer')
                    state = auction_data.get('Location').get('State')
                    sale_price = auction_data.get('MinimumBid')
                    link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                    link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                    sale_date = auction_data.get('SaleEndString')
                    
                    auction_values = [
                        main_title, short_title, make, 
                        model, variant, transmission, odometer, 
                        state, "${:,.2f}".format(sale_price).strip() , 
                        link, sale_date
                    ]
                    
                    with open('pickels_dataset.csv', 'r+') as csv_read:
                        auction_reader = list(csv.reader(csv_read))
                        for each in auction_reader:
                            if link in each:
                                each_link, each_price = each[9], each[0]
                                if (link == each_link) and (sale_price != each_price):
                                    auction_reader.clear()
                                    print('New list found, old list deleted')
                                    auction_csv_writer.writerow(auction_values)
                                    print('New value added')
                                    continue
                                elif (link == each[9]) and (sale_price == each[0]):
                                    print('Same result already exist in the file')
                                    continue
                        else:
                            auction_csv_writer.writerow(auction_values)
                            print('Unique result found and added.')
                            break

标签: pythoncsvweb-scrapingscrapy

解决方案


您当前的脚本正在打开您的拍卖 CSV 文件以进行附加,然后在它仍处于打开状态时尝试再次打开以进行读取。这可能是它没有按预期更新的原因。

更好的方法是首先将现有保存的拍卖文件的全部内容读入字典。关键可能是link这样可以很容易地确定您是否已经看过现有的拍卖。

接下来抓取当前的拍卖并saved_auctions根据需要更新字典。

最后,将内容写saved_auctions回 CSV 文件。

例如:

import requests
from scrapy.selector import Selector
import csv
import re


auction_filename = 'pickels_dataset.csv'

# Load existing auctions into a dictionary with link as key
saved_auctions = {}

with open(auction_filename, newline='', encoding='utf-8') as f_auction_file:
    for row in csv.reader(f_auction_file):
        saved_auctions[row[9]] = row      # dictionary key is link
    
live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)

for auctions in api_request.json():
    auction_link = auctions.get('viewSaleListingLink')
    
    if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
        auction_request = requests.get(url=auction_link)
        response = Selector(text=auction_request.text)

        sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
        sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
        
        if sales_id == []:
            continue
        
        auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
        auction_sale_link_requests = requests.get(url=auction_sale_link)

        auctions_data = auction_sale_link_requests.json().get('SearchResults')
        
        if auctions_data == []:
            print("NO RESULTS")
        
        for auction_data in auctions_data:
            if int(auction_data.get('MinimumBid')) > 0:
                ids = auction_data.get('TargetId')
                main_title = auction_data.get('Title')
                short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                make = auction_data.get('Make')
                model = auction_data.get('Model')
                variant = auction_data.get('Series')
                transmission = auction_data.get('Transmission')
                odometer = auction_data.get('Odometer')
                state = auction_data.get('Location').get('State')
                minimum_bid = auction_data.get('MinimumBid')
                sale_price = "${:,.2f}".format(minimum_bid).strip()
                link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                sale_date = auction_data.get('SaleEndString')
                
                auction_values = [
                    main_title, short_title, make, 
                    model, variant, transmission, odometer, 
                    state, sale_price, 
                    link, sale_date
                ]

                if link in saved_auctions:
                    if saved_auctions[link][8] == sale_price:
                        print('Same result already exists in the file')
                    else:
                        print('New value updated')
                        saved_auctions[link] = auction_values        # Updated the entry
                else:
                    print('New auction added')
                    saved_auctions[link] = auction_values
                    
# Update the saved auction file                    
with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
    csv_auction_file = csv.writer(f_auction_file)
    csv_auction_file.writerows(saved_auctions.values())

如果您还想删除不再有效的拍卖,那么最好直接忽略保存的文件并按原样写入所有当前条目。


推荐阅读