首页 > 解决方案 > 从 wunderground 抓取历史天气数据

问题描述

我是数据抓取的新手,最近,我试图通过 selenium 库和 python 从 wunderground.com 抓取数据。但是,我发现有时 selenium web 驱动程序无法成功打开网页,我认为这个问题可能与网站使用的 JavaScript 有点关系,但不确定哪些部分出错了。有谁知道如何解决它?提前致谢。

这是页面正确显示的示例:正确显示的示例

这是页面显示有问题的示例:有问题的示例

我的代码在这里,这是一个非常简单的 selenium 调用

import requests                                                                      
from bs4 import BeautifulSoup 
import pandas as pd
import numpy
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver import ChromeOptions
from selenium.webdriver import ActionChains
import time

# Page to load: the Weather Underground daily-history URL used in this example.
url = "https://www.wunderground.com/history/daily/us/ca/san-diego/KSAN/date/2021-2-1"

# Build the Chrome options that the browser is launched with.
option = webdriver.ChromeOptions()

# Experimental options, applied one by one in their original order.
for opt_name, opt_value in (
    ('excludeSwitches', ['enable-automation']),
    ('useAutomationExtension', False),
    # JavaScript content setting: 1 enables it, 2 would disable it.
    ("prefs", {'profile.managed_default_content_settings.javascript': 1}),
):
    option.add_experimental_option(opt_name, opt_value)

# Command-line switches handed to Chrome, again in their original order.
for switch in (
    '--disable-gpu',
    "--disable-blink-features",
    "--disable-blink-features=AutomationControlled",
    "--enable-javascript",
):
    option.add_argument(switch)

# Start the browser, open the page, then pause a fixed 5 seconds so the
# page has time to load before anything else runs.
driver = webdriver.Chrome(options=option)
driver.get(url)
time.sleep(5)

标签: python, selenium, selenium-webdriver, web-scraping

解决方案


该页面会向以下地址发送 HTTP GET 请求:https://api.weather.com/v1/location/KSAN:9:US/observations/historical.json?apiKey=e1f10a1e78da46f5b10a1e78da96f525&units=e&startDate=20210201
对这个调用的响应是一个巨大的 JSON,其中包含您要查找的数据。(以下是一个子集)

{
  "metadata": {
    "language": "en-US",
    "transaction_id": "1631220781880:2112944028",
    "version": "1",
    "location_id": "KSAN:9:US",
    "units": "e",
    "expire_time_gmt": 1631224381,
    "status_code": 200
  },
  "observations": [
    {
      "key": "KSAN",
      "class": "observation",
      "expire_time_gmt": 1612176660,
      "obs_id": "KSAN",
      "obs_name": "San Diego",
      "valid_time_gmt": 1612169460,
      "day_ind": "N",
      "temp": 59,
      "wx_icon": 27,
      "icon_extd": 2700,
      "wx_phrase": "Mostly Cloudy",
      "pressure_tend": 2,
      "pressure_desc": "Falling",
      "dewPt": 45,
      "heat_index": 59,
      "rh": 60,
      "pressure": 30.04,
      "vis": 10,
      "wc": 59,
      "wdir": null,
      "wdir_cardinal": "CALM",
      "gust": null,
      "wspd": 0,
      "max_temp": null,
      "min_temp": null,
      "precip_total": null,
      "precip_hrly": 0,
      "snow_hrly": null,
      "uv_desc": "Low",
      "feels_like": 59,
      "uv_index": 0,
      "qualifier": null,
      "qualifier_svrty": null,
      "blunt_phrase": null,
      "terse_phrase": null,
      "clds": "BKN",
      "water_temp": null,
      "primary_wave_period": null,
      "primary_wave_height": null,
      "primary_swell_period": null,
      "primary_swell_height": null,
      "primary_swell_direction": null,
      "secondary_swell_period": null,
      "secondary_swell_height": null,
      "secondary_swell_direction": null
    },
    {
      "key": "KSAN",
      "class": "observation",
      "expire_time_gmt": 1612180260,
      "obs_id": "KSAN",
      "obs_name": "San Diego",
      "valid_time_gmt": 1612173060,
      "day_ind": "N",
      "temp": 59,
      "wx_icon": 27,
      "icon_extd": 2700,
      "wx_phrase": "Mostly Cloudy",
      "pressure_tend": null,
      "pressure_desc": null,
      "dewPt": 47,
      "heat_index": 59,
      "rh": 64,
      "pressure": 30.04,
      "vis": 10,
      "wc": 59,
      "wdir": 260,
      "wdir_cardinal": "W",
      "gust": null,
      "wspd": 5,
      "max_temp": null,
      "min_temp": null,
      "precip_total": null,
      "precip_hrly": 0,
      "snow_hrly": null,
      "uv_desc": "Low",
      "feels_like": 59,
      "uv_index": 0,
      "qualifier": null,
      "qualifier_svrty": null,
      "blunt_phrase": null,
      "terse_phrase": null,
      "clds": "BKN",
      "water_temp": null,
      "primary_wave_period": null,
      "primary_wave_height": null,
      "primary_swell_period": null,
      "primary_swell_height": null,
      "primary_swell_direction": null,
      "secondary_swell_period": null,
      "secondary_swell_height": null,
      "secondary_swell_direction": null
    }
  ]
}

推荐阅读