python - 无法在谷歌上抓取图像,因为它在使用 python 抓取网页时更改了 URL
问题描述
我正在尝试使用 Python 中的 Beautiful Soup 开发我的第一个网络爬虫。目的是让爬虫询问用户输入,执行普通的谷歌图像搜索,并下载指定数量的图像。之前使用的 rg-meta 标签已变为 rg_i Q4LuWd,我已据此修改了代码,但仍然无法抓取图像。要查找并下载图像,还需要做哪些更改?程序运行时没有出现错误或异常,但找不到图像的 URL。
import os
import json
import requests # to sent GET requests
from bs4 import BeautifulSoup # to parse HTML
# user can input a topic and a number
# download first n images from google image search
# Base URL of the image search; the caller appends the 'q=<term>' part.
GOOGLE_IMAGE = (
    'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'
)

# Request headers sent with the search request.
# The User-Agent header carries a characteristic string that lets the peer
# identify the application type, operating system and software version of
# the requesting user agent; it is needed for the Google search request.
usr_agent = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
        '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# Folder that downloaded images are written to.
SAVE_FOLDER = 'images'
def main():
    """Make sure the download folder exists, then run the interactive downloader."""
    # exist_ok=True replaces the exists()/mkdir() pair, which can fail if the
    # folder appears between the check and the creation.
    os.makedirs(SAVE_FOLDER, exist_ok=True)
    download_images()
def download_images():
    """Ask for a search term and a count, then download the images found.

    Prompts on stdin for the term and the number of images, requests the
    Google image search page for that term, collects image links from the
    matching result elements and writes each successfully downloaded image
    to SAVE_FOLDER as '<term><index>.jpg' (index starts at 1).

    Raises:
        ValueError: if the requested number of images is not an integer.
    """
    # Local import: the script's import lines do not bring this name in.
    from urllib.parse import quote_plus

    # ask for user input
    data = input('What are you looking for? ')
    n_images = int(input('How many images do you want? '))

    print('Start searching...')

    # get url query string; quote_plus escapes spaces, '&', '#' and non-ASCII
    # characters that would otherwise corrupt the query
    searchurl = GOOGLE_IMAGE + 'q=' + quote_plus(data)
    print(searchurl)

    # request url, without usr_agent the permission gets denied
    response = requests.get(searchurl, headers=usr_agent)
    html = response.text

    # find all divs where class='rg_i Q4LuWd'
    # (earlier it was the rg-meta tag, which changed to rg_i Q4LuWd)
    # NOTE(review): if the served page has no such divs, or they carry no JSON
    # text, no links are collected and nothing is downloaded.
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all('div', {'class': 'rg_i Q4LuWd'}, limit=n_images)
    print(results)

    # extract the link from the div tag
    imagelinks = []
    for result in results:
        try:
            # the element text is expected to be a JSON object with an 'ou' key
            text_dict = json.loads(result.text)
            imagelinks.append(text_dict['ou'])
        except (json.JSONDecodeError, KeyError, TypeError):
            # not the expected payload: skip this element instead of aborting
            continue

    print(f'found {len(imagelinks)} images')
    print('Start downloading...')

    # Keep only filename-safe characters so a term such as 'cats/dogs' cannot
    # produce a path outside SAVE_FOLDER or an invalid file name.
    safe_name = ''.join(ch if ch.isalnum() or ch in ' _-' else '_' for ch in data)

    for i, imagelink in enumerate(imagelinks, start=1):
        # open image link and save as file
        response = requests.get(imagelink)
        if not response.ok:
            # do not store an HTTP error page under a .jpg name
            print(f'Skipping {imagelink}: HTTP {response.status_code}')
            continue
        imagename = os.path.join(SAVE_FOLDER, f'{safe_name}{i}.jpg')
        with open(imagename, 'wb') as file:
            file.write(response.content)

    print('Done')
# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()
解决方案
这是因为图像 URL 位于 <script> 标签中,要获取它们,您需要使用 regex 来匹配、提取并解码。另外,我不确定如何结合 input() 来实现这一点。
查找所有 <script> 标签:
# Keep the result: the regex step that follows reads it as all_script_tags.
all_script_tags = soup.select('script')
使用 regex 匹配图像数据:
# Stringify the script tags, capture the argument text of every
# AF_initDataCallback(...); call in them (the capture stops at any '<'),
# and join all captures into one string.
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
使用 regex 匹配所需的图像(全分辨率大小):
# Capture every URL that appears as ["<url>",<digits>,<digits>] directly after
# "'," or ",," in the matched data; these are used as the full-resolution
# image URLs. The captured text is still escaped at this point.
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
matched_images_data)
使用 bytes() 和 decode() 提取并解码它们:
# Decode each match twice with 'unicode-escape': the first pass still leaves
# escape sequences behind, the second pass decodes them.
for fixed_full_res_image in matched_google_full_resolution_images:
    original_size_img_not_fixed = fixed_full_res_image.encode('ascii').decode('unicode-escape')
    original_size_img = original_size_img_not_fixed.encode('ascii').decode('unicode-escape')
要保存图像,您可以使用 urllib.request.urlretrieve(url, filename)(更深入的介绍):
# The download often fails with a 404 error unless a User-Agent is sent,
# so install a global opener that adds one to every urllib request.
opener = urllib.request.build_opener()
opener.addheaders = [
    (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582',
    ),
]
urllib.request.install_opener(opener)

# Save the decoded full-resolution URL to a local file.
urllib.request.urlretrieve(
    original_size_img,
    'LOCAL_FOLDER_NAME/YOUR_IMAGE_NAME.jpg',
)
在线 IDE 中的代码和完整示例,抓取的内容更多(请耐心慢慢阅读):
import requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the search request.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Query-string parameters of the search request.
params = {
    "q": "pexels cat",  # search query
    "tbm": "isch",      # image search
    "hl": "en",
    "ijn": "0",
}

# requests applies no timeout by default; without one the script can block
# indefinitely if the server never answers.
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, 'lxml')
def _decode_escaped_url(raw_url):
    """Return *raw_url* decoded twice with the 'unicode-escape' codec."""
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    # After the first decoding, escape sequences are still present; the
    # second pass decodes them.
    decoded_once = bytes(raw_url, 'ascii').decode('unicode-escape')
    return bytes(decoded_once, 'ascii').decode('unicode-escape')


def get_images_data():
    """Print Google Images metadata and thumbnails, then download the originals.

    Reads the module-level ``soup``. Prints title, source and link of every
    result card, prints the thumbnail URLs found in the page's script data,
    and saves each full-resolution image as
    ``Images/original_size_img_<index>.jpg``. A download that fails is
    reported and skipped.
    """
    import os  # local import: the file's import line does not bring in os

    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        anchor = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')
        source_tag = google_image.select_one('.fxgdke')
        if anchor is None or source_tag is None:
            # select_one returns None when nothing matches; skip such cards
            # instead of failing on None['title'] / None.text.
            continue
        title = anchor['title']
        source = source_tag.text
        link = anchor['href']
        print(f'{title}\n{source}\n{link}\n')

    all_script_tags = soup.select('script')

    # https://regex101.com/r/48UZhY/4
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # The former json.dumps() -> json.loads() round trip on this str returned
    # the very same str, so the regex now runs on it directly.
    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data)

    # str() of a list renders its items with repr(), which escapes
    # backslashes; presumably this is why the URLs need two decoding passes.
    image_data_text = str(matched_google_image_data)

    # https://regex101.com/r/NnRg27/1
    thumbnail_pattern = r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'

    # Use the findall() list as is: joining with ', ' and splitting again
    # produced [''] when nothing matched and split URLs containing ', '.
    matched_google_images_thumbnails = re.findall(thumbnail_pattern, image_data_text)

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        print(_decode_escaped_url(fixed_google_image_thumbnail))

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(thumbnail_pattern, '', image_data_text)

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    print('\nDownloading Google Full Resolution Images:')  # in order

    # The opener is global state, so install it once instead of per image.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
    urllib.request.install_opener(opener)

    # urlretrieve does not create missing directories.
    os.makedirs('Images', exist_ok=True)

    for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
        original_size_img = _decode_escaped_url(fixed_full_res_image)
        print(original_size_img)

        # ------------------------------------------------
        # Download original images
        print(f'Downloading {index} image...')
        try:
            urllib.request.urlretrieve(original_size_img, f'Images/original_size_img_{index}.jpg')
        except urllib.error.URLError as err:
            # HTTPError (e.g. 404) is a URLError; one dead link should not
            # abort the remaining downloads.
            print(f'Could not download {original_size_img}: {err}')


get_images_data()
-------------
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
...
Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
Downloaded 0 image...
https://images.pexels.com/photos/3777622/pexels-photo-3777622.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
Downloaded 1 image...
...
'''
或者,您可以使用来自 SerpApi的Google 图片 API来实现相同的目的。这是一个带有免费计划的付费 API。
与您的做法不同之处在于,您不必使用 regex 从页面的源代码中匹配和提取所需数据;相反,您只需要遍历结构化的 JSON 即可获得想要的数据。
要集成的代码:
import os, urllib.request, json # json for pretty output
from serpapi import GoogleSearch
def get_google_images():
    """Fetch Google Images results through SerpApi, print and download them.

    Reads the API key from the ``API_KEY`` environment variable, prints the
    ``images_results`` list as indented JSON, and saves every entry's
    ``original`` URL as ``SerpApi_Images/original_size_img_<index>.jpg``.
    A download that fails is reported and skipped.

    Raises:
        KeyError: if the response dict has no ``images_results`` key.
    """
    params = {
        "api_key": os.getenv("API_KEY"),
        "engine": "google",
        "q": "pexels cat",
        "tbm": "isch",
    }

    search = GoogleSearch(params)
    results = search.get_dict()
    images_results = results['images_results']

    # print(json.dumps(results['suggested_searches'], indent=2, ensure_ascii=False))
    print(json.dumps(images_results, indent=2, ensure_ascii=False))

    # -----------------------
    # Downloading images

    # The opener is global state, so install it once instead of per image.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
    urllib.request.install_opener(opener)

    # urlretrieve does not create missing directories.
    os.makedirs('SerpApi_Images', exist_ok=True)

    for index, image in enumerate(images_results):
        print(f'Downloading {index} image...')
        try:
            urllib.request.urlretrieve(image['original'], f'SerpApi_Images/original_size_img_{index}.jpg')
        except urllib.error.URLError as err:
            # HTTPError (e.g. 404) is a URLError; one dead link should not
            # abort the remaining downloads.
            print(f"Could not download {image['original']}: {err}")


get_google_images()
---------------
'''
[
...
{
"position": 100, # img number
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
"source": "pexels.com",
"title": "Close-up of Cat · Free Stock Photo",
"link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
"original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
"is_product": false
}
]
Downloading 0 image...
Downloading 1 image...
Downloading 2 image...
'''
PS - 我写了一篇关于如何抓取 Google Images 的更详细的博客文章。
免责声明,我为 SerpApi 工作。
推荐阅读
- ssl - 无法与服务器建立安全连接
- gerrit - 如何隐藏 Gerrit 网页中的所有存储库?
- javascript - 无法读取未定义的属性状态
- electron - 让 paperjs 在电子应用程序中工作
- c# - ASP.NET Core 3.1 - 如何在经过身份验证后持久保存 JWT 令牌
- javascript - 如何获取鼠标单击 div 的宽度?
- ios - 如何打开新浪微博用户资料?
- jquery - AJAX 加载后 jQuery 事件未触发。需要点击两次才能触发事件
- r - 使用 R 求解 4 个未知数中的 4 个方程组
- ssh - 关于如何在 Ubuntu 18.04 LTS 上通过 ssh 进入远程服务器的问题