首页 > 解决方案 > 格式化抓取的数据 Python Beautifulsoup

问题描述

我正在尝试从这个 URL https://www.spoonflower.com/en/shop?on=fabric 抓取设计名称、创建者名称、面料类型以及各面料类型对应的价格。好消息是他们有公开的 API 端点,这让数据提取变得很简单;但问题是设计名称和定价分别对应不同的 URL。也就是说,要收集设计名称和创建者名称,我必须请求这个 URL:https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en

而要获取每种面料类型的定价,则需要请求这个端点:

https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en

我得到了正确的数据,但我偶然发现了一些格式问题。

我想要的是这样的结果(预期输出结果):每种设计及其所有面料类型和对应价格都排在同一行。然而,我得到的却是这样的输出(我的输出结果)。

如果这里的任何人都可以指导我如何获得我正在寻找的 expected_output_result,那就太好了。

下面是我的代码:

import requests
from bs4 import BeautifulSoup
import json
import csv


# Cookie sent with every pricing request.
cookies = {
    'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}

# Request headers sent with every pricing request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
    'Content-Type': 'application/json',
    'Origin': 'https://www.spoonflower.com',
    'Connection': 'keep-alive',
    'Referer': 'https://www.spoonflower.com/',
    'Sec-GPC': '1',
    'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
    'TE': 'Trailers',
}

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30

# CSV columns, declared up front so the header can be written even when no
# row was scraped (indexing scraped_items[0] would fail on an empty list).
FIELDNAMES = [
    'designName',
    'screenName',
    'fabric_name',
    'test_swatch_meter',
    'fat_quarter_meter',
    'meter',
]

# Example of a fully expanded pricing URL:
# https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
PRICING_ENDPOINT = (
    'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_{fabric}'
    '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC'
    '&design_id={design_id}&page_locale=en'
)


def _price_or_na(pricing_response, unit):
    """Return the price stored under *unit* in a pricing response, or 'N/A'.

    Only a missing key or a non-subscriptable value along the lookup path
    is treated as "no price"; any other error is allowed to propagate.
    """
    try:
        return pricing_response['data']['pricing'][unit]['price']
    except (KeyError, TypeError):
        return 'N/A'


# Collect the fabric display names from the <h2> of every product tile and
# turn them into UPPER_SNAKE_CASE codes for use in the pricing URL.
res = requests.get('https://www.spoonflower.com/spoonflower_fabrics', timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [
    fabric_div.find('h2').text.strip()
    for fabric_div in soup.find_all('div', {'class': 'product_detail medium_text'})
]
fabric = ['_'.join(fab.upper().split()) for fab in fabrics]

# The search endpoint returns the designs (name, creator, id) of one result page.
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint, timeout=REQUEST_TIMEOUT).json()

# One output row per (design, fabric type) pair.
scraped_items = []
for item in item_response['page_results']:
    for fab_type in fabric:
        details_endpoint = PRICING_ENDPOINT.format(fabric=fab_type, design_id=item['designId'])
        details_endpoint_response = requests.get(
            details_endpoint,
            headers=headers,
            cookies=cookies,
            timeout=REQUEST_TIMEOUT,
        ).json()

        # No trailing comma here: `x = value,` would build a 1-tuple and the
        # CSV cell would read "('name',)" instead of the plain design name.
        design_name = item['name']
        screen_name = item['user']['screenName']
        fabric_name = details_endpoint_response['data']['fabric_code']
        test_swatch_meter = _price_or_na(details_endpoint_response, 'TEST_SWATCH_METER')
        fat_quarter_meter = _price_or_na(details_endpoint_response, 'FAT_QUARTER_METER')
        meter = _price_or_na(details_endpoint_response, 'METER')

        scraped_items.append({
            'designName': design_name,
            'screenName': screen_name,
            'fabric_name': fabric_name,
            'test_swatch_meter': test_swatch_meter,
            'fat_quarter_meter': fat_quarter_meter,
            'meter': meter,
        })
        print(design_name, screen_name, fabric_name, test_swatch_meter, fat_quarter_meter, meter)

print(json.dumps(scraped_items, indent=2))

# An explicit encoding keeps non-ASCII design names from failing on platforms
# whose default encoding is not UTF-8.
with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES)
    writer.writeheader()
    writer.writerows(scraped_items)

标签: python, python-3.x, beautifulsoup

解决方案


一种方法是重新组织构建输出的方式:使用字典而不是列表,以 (designName, screenName) 作为键,后面跟着对应的值。需要记住的一点是,字典不允许重复的键,因此必须给列名编号;不过如果您愿意,之后可以再把编号去掉。

看看这是否得到你想要的:

import requests
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
from collections import OrderedDict


# Cookie sent with every pricing request.
cookies = {
    'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}

# Request headers sent with every pricing request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
    'Content-Type': 'application/json',
    'Origin': 'https://www.spoonflower.com',
    'Connection': 'keep-alive',
    'Referer': 'https://www.spoonflower.com/',
    'Sec-GPC': '1',
    'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
    'TE': 'Trailers',
}

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30

# Number of columns stored per fabric type (fabric_name plus three prices);
# used to derive the running suffix for the next group of columns.
COLUMNS_PER_FABRIC = 4

# Example of a fully expanded pricing URL:
# https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
PRICING_ENDPOINT = (
    'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_{fabric}'
    '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC'
    '&design_id={design_id}&page_locale=en'
)


def _price_or_na(pricing_response, unit):
    """Return the price stored under *unit* in a pricing response, or 'N/A'.

    Only a missing key or a non-subscriptable value along the lookup path
    is treated as "no price"; any other error is allowed to propagate.
    """
    try:
        return pricing_response['data']['pricing'][unit]['price']
    except (KeyError, TypeError):
        return 'N/A'


# Collect the fabric display names from the <h2> of every product tile and
# turn them into UPPER_SNAKE_CASE codes for use in the pricing URL.
res = requests.get('https://www.spoonflower.com/spoonflower_fabrics', timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [
    fabric_div.find('h2').text.strip()
    for fabric_div in soup.find_all('div', {'class': 'product_detail medium_text'})
]
fabric = ['_'.join(fab.upper().split()) for fab in fabrics]

# The search endpoint returns the designs (name, creator, id) of one result page.
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint, timeout=REQUEST_TIMEOUT).json()

# Maps (designName, screenName) -> one wide row. Each fabric type adds four
# numbered columns to the row, because a dict cannot hold repeated keys.
items_dict = OrderedDict()
for item in item_response['page_results']:
    for fab_type in fabric:
        details_endpoint = PRICING_ENDPOINT.format(fabric=fab_type, design_id=item['designId'])
        details_endpoint_response = requests.get(
            details_endpoint,
            headers=headers,
            cookies=cookies,
            timeout=REQUEST_TIMEOUT,
        ).json()

        designName = item['name']
        screenName = item['user']['screenName']
        fabric_name = details_endpoint_response['data']['fabric_code']
        test_swatch_meter = _price_or_na(details_endpoint_response, 'TEST_SWATCH_METER')
        fat_quarter_meter = _price_or_na(details_endpoint_response, 'FAT_QUARTER_METER')
        meter = _price_or_na(details_endpoint_response, 'METER')

        # Fetch the row for this design, creating it on first sight.
        row = items_dict.setdefault((designName, screenName), {})

        # Floor division keeps the suffix an int; true division would yield a
        # float and only work because '%d' happens to truncate it.
        itemCount = len(row) // COLUMNS_PER_FABRIC
        row.update({
            'fabric_name_%02d' % itemCount: fabric_name,
            'test_swatch_meter_%02d' % itemCount: test_swatch_meter,
            'fat_quarter_meter_%02d' % itemCount: fat_quarter_meter,
            'meter_%02d' % itemCount: meter,
        })

        print(designName, screenName, fabric_name, test_swatch_meter, fat_quarter_meter, meter)


# The tuple keys become a two-level index; reset_index() turns the levels
# into ordinary columns named level_0 / level_1, which are then renamed.
df = pd.DataFrame.from_dict(items_dict, orient='index').reset_index(drop=False)
df = df.rename(columns={'level_0': 'designName', 'level_1': 'screenName'})
df.to_csv('scraped_data.csv', index=False)

推荐阅读