python - 删除抓取数据之间的空格

问题描述

我正在尝试从网站上抓取一些数据并将其保存到 csv 文件中。当我得到抓取的数据时，每行之间都有很大的空白。我希望能够删除这些不必要的空白。下面是我的代码：

from bs4 import BeautifulSoup
import requests
import csv

# Scrape product names and prices from a Sainsbury's category page, print
# them, and write a row to sainsburys.csv for each product whose price
# lookup succeeds.
# URL of the category listing to scrape
url_to_scrape = 'https://www.sainsburys.co.uk/shop/gb/groceries/meat-fish/CategoryDisplay?langId=44&storeId=10151&catalogId=10241&categoryId=310864&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&beginIndex=0&promotionId=&listId=&searchTerm=&hasPreviousOrder=&previousOrderId=&categoryFacetId1=&categoryFacetId2=&ImportedProductsCount=&ImportedStoreName=&ImportedSupermarket=&bundleId=&parent_category_rn=13343&top_category=13343&pageSize=120#langId=44&storeId=10151&catalogId=10241&categoryId=310864&parent_category_rn=13343&top_category=13343&pageSize=120&orderBy=FAVOURITES_ONLY%7CSEQUENCING%7CTOP_SELLERS&searchTerm=&beginIndex=0&hideFilters=true'
# Fetch the page; the response object is kept and its body is read via .text below
plain_html_text = requests.get(url_to_scrape)
# Parse the response body with the lxml parser
soup = BeautifulSoup(plain_html_text.text, "lxml")
# NOTE(review): the file is opened without newline=''; the csv module docs
# recommend newline='' for files passed to csv.writer -- confirm whether the
# extra line endings matter on the target platform.

csv_file = open('sainsburys.csv', 'w')

csv_writer = csv.writer(csv_file)
# Header row
csv_writer.writerow(['Description','Price'])

# Each <li class="gridItem"> element is treated as one product
for name_of in soup.find_all('li',class_='gridItem'):
    # .text is used as-is, so any whitespace/newlines surrounding the link
    # text are kept -- this is the padding visible in the printed output.
    # NOTE(review): calling .strip() on the text would remove it.
    name = name_of.h3.a.text
    print(name)
    try:
        price = name_of.find('div', class_='product')
        # Same as above: the paragraph text is taken without stripping
        pricen = price.find('div', class_='addToTrolleytabBox').p.text
        print(pricen)
        csv_writer.writerow([name, pricen])
    # NOTE(review): bare except -- any exception raised in the try body
    # (not only a missing price element) is reported as 'Sold Out'.
    except:
        print('Sold Out')
        print()

# NOTE(review): this statement is outside the loop, so it runs once after the
# loop and writes the most recently assigned name/pricen again; it raises
# NameError if either name was never assigned. Looks unintended -- confirm.
csv_writer.writerow([name, pricen])
csv_file.close()

我得到的结果是这样的：

                                       J. James Chicken Goujons 270g



        £1.25/unit


                                        Sainsbury's Chicken Whole Bird (approx. 0.9-1.35kg)



        £1.90/kg


                                        Sainsbury's British Fresh Chicken Fajita Mini Fillets 320g



        £2.55/unit


                                        Sainsbury's Slow Cook Fire Cracker Chicken 573g



        £4.75/unit

谢谢

标签： python web-scraping

解决方案

如果您记录浏览器的网络流量，并筛选出 XHR 请求，就会发现页面在与一个 AJAX 接口通信。该接口由服务器返回 HTML（可惜不是纯粹的 JSON，而是把 HTML 嵌在 JSON 响应里）。您其实不必这样做，因为您的代码看起来已经可以正常抓取页面；不过这是获取商品的一种更简洁的方式，而且不必再操心分页之类的问题。正如其他人已经指出的，要去除前导和尾随空白，请使用 `str.strip`。在这个示例中，我只打印前十个商品（共 114 个）。是的，我知道可以把查询字符串直接拼接到 URL 后面，而不是创建 `params` 字典，但这样更易于阅读和修改：

import requests
from bs4 import BeautifulSoup


class Product:
    """One product parsed from an HTML fragment.

    Attributes:
        name: The first ``<a>`` element's stripped text, minus its last
            space-separated token.
        weight: That last space-separated token (e.g. ``"640g"``). If the
            text contains no space, ``name`` is empty and ``weight`` holds
            the whole text.
        price_per_unit: Stripped text of the first ``<p class="pricePerUnit">``.
        price_per_measure: Stripped text of the first
            ``<p class="pricePerMeasure">``.
    """

    def __init__(self, html):
        """Parse *html* and populate the attributes listed on the class.

        Raises:
            AttributeError: If the fragment lacks the ``<a>`` element or
                either of the two price paragraphs.
        """
        # Function-scope import keeps the class usable on its own.
        from bs4 import BeautifulSoup

        fragment = BeautifulSoup(html, "html.parser")

        # Split on the LAST space so multi-word names stay intact.
        link_text = fragment.find("a").get_text().strip()
        name_part, _, weight_part = link_text.rpartition(" ")
        self.name = name_part
        self.weight = weight_part

        unit_tag = fragment.find("p", class_="pricePerUnit")
        self.price_per_unit = unit_tag.get_text().strip()

        measure_tag = fragment.find("p", class_="pricePerMeasure")
        self.price_per_measure = measure_tag.get_text().strip()

    def __str__(self):
        """Return ``"<name>" (<weight>) - <price_per_unit>``."""
        return f'"{self.name}" ({self.weight}) - {self.price_per_unit}'

# AJAX endpoint queried in place of the rendered category page.
url = "https://www.sainsburys.co.uk/webapp/wcs/stores/servlet/AjaxApplyFilterBrowseView"

# Query-string fields, kept as a mapping so individual values are easy to
# read and change; requests encodes them into the URL.
params = dict(
    langId="44",
    storeId="10151",
    catalogId="10241",
    categoryId="310864",
    parent_category_rn="13343",
    top_category="13343",
    pageSize="120",
    orderBy="FAVOURITES_ONLY|SEQUENCING|TOP_SELLERS",
    searchTerm="",
    beginIndex="0",
    hideFilters="true",
    requesttype="ajax",
)

response = requests.get(url, params=params)
# Fail loudly on an HTTP error status instead of parsing an error body.
response.raise_for_status()

# The decoded JSON is indexed positionally: element 4 -> first entry of
# "productLists" -> its "products" list.
payload = response.json()
product_info = payload[4]["productLists"][0]["products"]

# Only the first ten entries are turned into Product objects; each entry's
# "result" value is what Product parses.
first_ten = product_info[:10]
products = [Product(entry["result"]) for entry in first_ten]

for product in products:
    print(product)

输出：

"Sainsbury's Chicken Thigh Fillets" (640g) - £3.40/unit
"Sainsbury's Mini Chicken Breast Fillets" (320g) - £2.00/unit
"Sainsbury's Chicken Thighs" (1kg) - £1.95/unit
"Sainsbury's Chicken Breast Fillets" (300g) - £1.70/unit
"Sainsbury's Chicken Drumsticks" (1kg) - £1.70/unit
"Sainsbury's Chicken Thigh Fillets" (320g) - £1.85/unit
"Sainsbury's Chicken Breast Diced" (410g) - £2.40/unit
"Sainsbury's Chicken Small Whole Bird" (1.35kg) - £2.80/unit
"Sainsbury's Chicken Thighs & Drumsticks" (540g) - £1.00/unit
"Sainsbury's Chicken Breast Fillets" (640g) - £3.60/unit
>>> product.price_per_measure
'£5.63/kg'
>>>

python - 删除抓取数据之间的空格 - Python

问题描述

解决方案

推荐阅读