python - 无法在谷歌上抓取图像，因为它在使用 python 抓取网页时更改了 URL

问题描述

我正在尝试使用 Python 中的 Beautiful Soup 开发我的第一个网络爬虫。目的是让爬虫询问用户输入，进行正常的谷歌图像搜索并下载所有需要数量的图像。之前是 rg-meta 标签更改为 rg_i Q4LuWd ，对代码进行了更改。它仍然无法抓取图像。查找和下载图像还需要进行哪些更改。没有发现错误或异常，程序运行但找不到图像的url

import os
import json 
import requests # to sent GET requests
from bs4 import BeautifulSoup # to parse HTML

# user can input a topic and a number
# download first n images from google image search

GOOGLE_IMAGE = \
    'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'

# The User-Agent request header contains a characteristic string 
# that allows the network protocol peers to identify the application type, 
# operating system, and software version of the requesting software user agent.
# needed for google search
usr_agent = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
SAVE_FOLDER = 'images'

def main():
    if not os.path.exists(SAVE_FOLDER):
        os.mkdir(SAVE_FOLDER)
    download_images()
    
def download_images():
    # ask for user input
    data = input('What are you looking for? ')
    n_images = int(input('How many images do you want? '))

    print('Start searching...')
    
    # get url query string
    searchurl = GOOGLE_IMAGE + 'q=' + data
    print(searchurl)

    # request url, without usr_agent the permission gets denied
    response = requests.get(searchurl, headers=usr_agent)
    html = response.text

    # find all divs where class='rg_i Q4LuWd'
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.findAll('div', {'class': 'rg_i Q4LuWd'},limit=n_images)
**Earlier it was rg-meta tag that changed to rg_i Q4LuWd**
    print(results)
    # extract the link from the div tag
    imagelinks= []
    for re in results:
        text = re.text # this is a valid json string
        text_dict= json.loads(text) # deserialize json to a Python dict
        link = text_dict['ou']
        # image_type = text_dict['ity']
        imagelinks.append(link)

    print(f'found {len(imagelinks)} images')
    print('Start downloading...')

    for i, imagelink in enumerate(imagelinks):
        # open image link and save as file
        response = requests.get(imagelink)
        
        imagename = SAVE_FOLDER + '/' + data + str(i+1) + '.jpg'
        with open(imagename, 'wb') as file:
            file.write(response.content)

    print('Done')


if __name__ == '__main__':
    main()

标签： pythonweb-scrapingbeautifulsoupgoogle-image-search

解决方案

这是因为图像 URL 位于<script>标签中，为了获取它们，您需要使用它们regex来匹配、提取和解码它们。另外，我不确定如何在input().

查找所有 <script>标签：

soup.select('script')

通过以下方式匹配图像数据regex：

matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

通过以下方式匹配所需的图像（全分辨率大小）regex：

matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                    matched_images_data)

bytes()使用and提取和解码它们decode()：

for fixed_full_res_image in matched_google_full_resolution_images:
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')

要保存图像，您可以使用urllib.request.urlretrieve(url, filename)（更深入）：

# often times it will throw 404 error, so to avoid it we need to pass user-agent

opener=urllib.request.build_opener()
opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
urllib.request.install_opener(opener)

urllib.request.urlretrieve(original_size_img, f'LOCAL_FOLDER_NAME/YOUR_IMAGE_NAME.jpg')

在线IDE中的代码和完整示例，刮得更多（尝试慢慢阅读）：

import requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "pexels cat",
    "tbm": "isch", 
    "hl": "en",
    "ijn": "0",
}

html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')


def get_images_data():

    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
        source = google_image.select_one('.fxgdke').text
        link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
        print(f'{title}\n{source}\n{link}\n')

    # this steps could be refactored to a more compact
    all_script_tags = soup.select('script')

    # # https://regex101.com/r/48UZhY/4
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
    
    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    # if you try to json.loads() without json.dumps it will throw an error:
    # "Expecting property name enclosed in double quotes"
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    matched_google_images_thumbnails = ', '.join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(', ')

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')

        # after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
        google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
        print(google_image_thumbnail)

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)


    print('\nDownloading Google Full Resolution Images:')  # in order
    for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
        original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
        print(original_size_img)

        # ------------------------------------------------
        # Download original images

        print(f'Downloading {index} image...')
        
        opener=urllib.request.build_opener()
        opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
        urllib.request.install_opener(opener)

        urllib.request.urlretrieve(original_size_img, f'Images/original_size_img_{index}.jpg')


get_images_data()


-------------
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
...

Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
...

Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
Downloaded 0 image...
https://images.pexels.com/photos/3777622/pexels-photo-3777622.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
Downloaded 1 image...
...
'''

或者，您可以使用来自 SerpApi的Google 图片 API来实现相同的目的。这是一个带有免费计划的付费 API。

您的情况的不同之处在于您不必处理regex从页面的源代码中匹配和提取所需数据，相反，您只需要遍历结构化 JSON 并获得您想要的。

要集成的代码：

import os, urllib.request, json # json for pretty output
from serpapi import GoogleSearch


def get_google_images():
    params = {
      "api_key": os.getenv("API_KEY"),
      "engine": "google",
      "q": "pexels cat",
      "tbm": "isch"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    # print(json.dumps(results['suggested_searches'], indent=2, ensure_ascii=False))
    print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))

    # -----------------------
    # Downloading images

    for index, image in enumerate(results['images_results']):

        print(f'Downloading {index} image...')
        
        opener=urllib.request.build_opener()
        opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
        urllib.request.install_opener(opener)

        urllib.request.urlretrieve(image['original'], f'SerpApi_Images/original_size_img_{index}.jpg')


get_google_images()

---------------
'''
[
...
  {
    "position": 100, # img number
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
    "source": "pexels.com",
    "title": "Close-up of Cat · Free Stock Photo",
    "link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
    "original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
    "is_product": false
  }
]

Downloading 0 image...
Downloading 1 image...
Downloading 2 image...
'''

PS-我写了一篇关于如何抓取Google Imgaes的更详细的博客文章。

免责声明，我为 SerpApi 工作。

python - 无法在谷歌上抓取图像，因为它在使用 python 抓取网页时更改了 URL

问题描述

解决方案

推荐阅读