python - 如何在 Python 中使用 requests 和 BeautifulSoup 抓取网站的所有分页
问题描述
我想从这个网站上抓取浴帘的名称和价格。该站点有超过 200 个页面,但这段代码只对前 100 个页面有效,之后它会重复抓取同样的 100 个页面。
import requests
from bs4 import BeautifulSoup
import re
import csv
# URL template for a product preview image; "{}" is filled with the numeric
# image id that main() extracts from each listing page.
site = "https://ih1.redbubble.net/image.{}/ur,shower_curtain_closed,square,600x600.1.jpg"
# Column headers of curtains.csv.
firstrow = ['No.', 'Name', 'Price', 'Image Url']
# Use the same encoding as the data rows appended later so the file is never
# written with two different encodings.
with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    # A stream opened in append mode starts at end-of-file, so position 0
    # means the file is new or empty. Writing the header only then keeps
    # repeated runs from inserting extra header rows between data rows.
    if csvFile.tell() == 0:
        writer.writerow(firstrow)
def main(url, pages=range(1, 205)):
    """Scrape product names, prices and image URLs into curtains.csv.

    Each listing page is downloaded, the product ``<img>`` tags are paired
    with the unique image ids found in the page text, and one CSV row is
    appended per pair.

    NOTE(review): according to the question this code comes from, the HTML
    listing starts repeating after page 100, so later pages may only yield
    duplicates -- confirm against the live site.

    Args:
        url: Listing URL template with one ``{}`` placeholder for the page
            number.
        pages: Iterable of page numbers to fetch. Defaults to pages 1-204.

    Raises:
        requests.HTTPError: If a page responds with an error status.
    """
    count = 0
    # Open the output once for the whole run instead of once per row.
    with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)
        for page in pages:
            final_url = url.format(page)
            print('\n', '*'*10, 'Scraping Page # {}'.format(page), '*'*10)
            print('Link # {}'.format(final_url))
            # Without a timeout a stalled connection would hang the scrape.
            r = requests.get(final_url, timeout=30)
            r.raise_for_status()
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("img[class*=styles__rounded--1lyoH]")
            # dict.fromkeys drops duplicate ids while keeping first-seen order.
            goal = list(dict.fromkeys(re.findall(r'\.(\d+\.\d{4})', r.text)))
            for tar, go in zip(target, goal):
                count += 1
                name = tar.get('alt', '')
                # Only the 4th following <span> is needed, so stop the search
                # there instead of collecting every span after the image.
                spans = tar.find_all_next('span', limit=4)
                price = spans[3].text if len(spans) > 3 else ''
                img = site.format(go)
                print('*'*20, count, '*'*20)
                print('Name: {}'.format(name))
                print('Price: {}'.format(price))
                print('Image Url: {}'.format(img))
                writer.writerow([count, name, price, img])
# Start the scrape; the "{}" in the URL is replaced with each page number.
main("https://www.redbubble.com/shop/shower-curtains?page={}")
解决方案
import requests
import csv
# JSON body sent to the GraphQL endpoint by main(). main() supplies the actual
# page number under variables -> queryParams -> page for every request, so the
# 0 below is only a placeholder.
data = {
"operationName": "withSearchResults",
"query": "query withSearchResults($query: String!, $queryParams: QueryParams, $locale: String!, $country: String!, $currency: String!, $previewTypeIds: [String!], $experience: String) {\n searchResults(query: $query, queryParams: $queryParams, locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds, experience: $experience) {\n ...Results\n ...TrendingResults\n ...Metadata\n ...Filters\n ...Pagination\n ...LandingPage\n __typename\n }\n}\n\nfragment Results on SearchResults {\n results {\n inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n id\n description\n productTypeId\n productPageUrl\n blankItemId\n price {\n id\n amount\n currency\n __typename\n }\n previewSet {\n id\n previews {\n previewTypeId\n url\n __typename\n }\n __typename\n }\n gaCode\n gaCategory\n attributes {\n name\n value\n attributes {\n name\n value\n __typename\n }\n __typename\n }\n volumeDiscount {\n id\n thresholds {\n percentOff\n quantity\n __typename\n }\n __typename\n }\n experiencesProductCard {\n name\n value\n __typename\n }\n __typename\n }\n work(locale: $locale) {\n id\n title\n artistName\n isMatureContent\n tags\n __typename\n }\n defaultPreviewTypeId\n groupId\n rank\n __typename\n }\n __typename\n}\n\nfragment TrendingResults on SearchResults {\n trendingResults {\n inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n id\n description\n productPageUrl\n productTypeId\n price {\n id\n amount\n currency\n __typename\n }\n previewSet {\n id\n previews {\n previewTypeId\n url\n __typename\n }\n __typename\n }\n volumeDiscount {\n id\n thresholds {\n percentOff\n quantity\n __typename\n }\n __typename\n }\n gaCode\n gaCategory\n attributes {\n name\n value\n attributes {\n name\n value\n __typename\n }\n __typename\n }\n experiencesProductCard {\n name\n value\n __typename\n }\n __typename\n }\n work(locale: $locale) {\n id\n title\n 
artistName\n isMatureContent\n tags\n __typename\n }\n defaultPreviewTypeId\n rank\n __typename\n }\n __typename\n}\n\nfragment Metadata on SearchResults {\n metadata {\n title\n searchContext {\n category\n __typename\n }\n resultCount\n topic\n searchBar {\n iaCode\n pillLabel\n keywords\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment Filters on SearchResults {\n filters {\n resetUrl\n staticFilters {\n type\n label\n options {\n name\n label\n applied\n url\n options {\n name\n label\n applied\n url\n __typename\n }\n __typename\n }\n __typename\n }\n filters {\n type\n label\n experiences {\n name\n value\n __typename\n }\n options {\n name\n label\n applied\n disabled\n url\n hexColor\n imageUrl\n __typename\n }\n __typename\n }\n appliedCount\n appliedPath\n resets {\n label\n url\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment Pagination on SearchResults {\n pagination {\n currentPage\n perPage\n showPreviousPageLink\n showNextPageLink\n paginationLinks {\n namedLinks {\n previousPage {\n rel\n url\n __typename\n }\n nextPage {\n rel\n url\n __typename\n }\n __typename\n }\n __typename\n }\n fromNumber\n toNumber\n total\n __typename\n }\n __typename\n}\n\nfragment LandingPage on SearchResults {\n metadata {\n formattedQuery\n landingPage {\n hero {\n pitch\n title\n image\n color\n __typename\n }\n bubbles {\n title\n items {\n title\n image\n realisticImage\n url\n isExternal\n __typename\n }\n hasImages\n __typename\n }\n seoMetadata {\n pageDescription\n robots\n canonicalURL\n searchTitle\n seoImage\n alternatePageVersions {\n href\n locale\n __typename\n }\n relatedTagLinks {\n title\n href\n text\n __typename\n }\n __typename\n }\n footer {\n text\n breadcrumbs {\n name\n url\n __typename\n }\n __typename\n }\n __typename\n }\n relatedTopics {\n title\n url\n __typename\n }\n relatedProducts {\n id\n url\n productTitle\n fullTitle\n __typename\n }\n searchPageType\n resultCount\n searchUUID\n __typename\n }\n 
__typename\n}\n",
"variables": {"country": "EG", "currency": "USD", "experience": "srp", "locale": "en",
"previewTypeIds": ["product_close", "alternate_product_close", "artwork"],
"query": "shower-curtains", "queryParams": {"page": 0}
}
}
def main(url, pages=range(1, 11)):
    """Download search-result pages from the GraphQL endpoint into data.csv.

    For every requested page the module-level ``data`` payload is sent with
    that page number, and one CSV row (title, price amount, first preview
    image URL) is written per returned result. ``data.csv`` is overwritten
    on each call.

    Args:
        url: GraphQL endpoint that accepts the ``data`` payload via POST.
        pages: Iterable of page numbers to fetch. Defaults to pages 1-10.

    Raises:
        requests.HTTPError: If the endpoint responds with an error status.
    """
    with requests.Session() as req, \
            open("data.csv", 'w', newline="", encoding="UTF-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Price", "IMG"])
        variables = data['variables']
        for page in pages:
            print(f"Extracting Page# {page}")
            # Build a per-request payload instead of mutating the shared
            # module-level template, so ``data`` is left untouched.
            payload = {
                **data,
                'variables': {
                    **variables,
                    'queryParams': {**variables['queryParams'], 'page': page},
                },
            }
            # Without a timeout a stalled connection would hang the scrape.
            resp = req.post(url, json=payload, timeout=30)
            resp.raise_for_status()
            results = resp.json()['data']['searchResults']['results']
            for result in results:
                inventory = result['inventoryItem']
                previews = inventory['previewSet']['previews']
                writer.writerow([
                    result['work']['title'],
                    inventory['price']['amount'],
                    # Leave the column blank when a result has no preview.
                    previews[0]['url'] if previews else '',
                ])
# Start the scrape against the GraphQL endpoint.
main("https://www.redbubble.com/boom/graphql")
输出:在线查看
样本:
推荐阅读
- javascript - javascript中的filter()
- inno-setup - 将窗口调整到尽可能小的尺寸时,在自定义页面上裁剪 TLabel 控件(在 Inno Setup 上)
- javascript - 从 Leaflet 中的动态点设置舒适的地图视图
- android - 如何更改 Android 中的“选择光标”颜色?
- php - 如何将我的 sql 中使用的代码更改为 oci?
- css - 如何使行高拉伸到容器的剩余高度?
- python - 将 Telethon 与 django 一起使用:线程 'Thread-1' 中没有当前事件循环
- python - 如何在 django 中获取有限数量的近期帖子
- python - 通过表单发布数据后未创建 django 对象
- boost - Solr 在应用提升之前标准化分数