python - 从 wunderground 抓取历史天气数据
问题描述
我是数据抓取的新手。最近,我尝试使用 selenium 库和 Python 从 wunderground.com 抓取数据。但是,我发现 selenium 的 WebDriver 有时无法成功打开网页。我猜这个问题可能与网站使用的 JavaScript 有关,但不确定具体是哪一部分出了问题。有谁知道该如何解决吗?提前致谢。
这是页面正确显示的示例:正确显示的示例
这是页面显示有问题的示例:有问题的示例
我的代码在这里,这是一个非常简单的 selenium 调用
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver import ChromeOptions
from selenium.webdriver import ActionChains
import time
# URL of the Weather Underground daily-history page to load
# (station KSAN, San Diego, date 2021-2-1 per the path segments).
url = "https://www.wunderground.com/history/daily/us/ca/san-diego/KSAN/date/2021-2-1"
# Configure the Chrome options passed to the Selenium WebDriver.
option = webdriver.ChromeOptions()
# NOTE(review): presumably these two experimental options are meant to hide
# Chrome's automation indicators from the site -- confirm against ChromeDriver docs.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option('useAutomationExtension', False)
# JavaScript content setting: value 1 enables it, value 2 disables it.
option.add_experimental_option( "prefs",{'profile.managed_default_content_settings.javascript': 1})
option.add_argument('--disable-gpu')
# The same switch is passed twice: first with no value, then with
# AutomationControlled as its value.
option.add_argument("--disable-blink-features")
option.add_argument("--disable-blink-features=AutomationControlled")
# NOTE(review): could not confirm that Chrome recognizes this switch -- verify.
option.add_argument("--enable-javascript")
# Start Chrome with the options above and navigate to the page.
driver = webdriver.Chrome(options=option)
driver.get(url)
# Fixed 5-second pause to let the page load; nothing here checks that the
# page content has actually finished rendering.
time.sleep(5)
解决方案
该页面会向以下地址发送 HTTP GET 请求:
https://api.weather.com/v1/location/KSAN:9:US/observations/historical.json?apiKey=e1f10a1e78da46f5b10a1e78da96f525&units=e&startDate=20210201
该请求的响应是一个巨大的 JSON,其中包含您要查找的数据。(以下是其中的一个子集)
{
"metadata": {
"language": "en-US",
"transaction_id": "1631220781880:2112944028",
"version": "1",
"location_id": "KSAN:9:US",
"units": "e",
"expire_time_gmt": 1631224381,
"status_code": 200
},
"observations": [
{
"key": "KSAN",
"class": "observation",
"expire_time_gmt": 1612176660,
"obs_id": "KSAN",
"obs_name": "San Diego",
"valid_time_gmt": 1612169460,
"day_ind": "N",
"temp": 59,
"wx_icon": 27,
"icon_extd": 2700,
"wx_phrase": "Mostly Cloudy",
"pressure_tend": 2,
"pressure_desc": "Falling",
"dewPt": 45,
"heat_index": 59,
"rh": 60,
"pressure": 30.04,
"vis": 10,
"wc": 59,
"wdir": null,
"wdir_cardinal": "CALM",
"gust": null,
"wspd": 0,
"max_temp": null,
"min_temp": null,
"precip_total": null,
"precip_hrly": 0,
"snow_hrly": null,
"uv_desc": "Low",
"feels_like": 59,
"uv_index": 0,
"qualifier": null,
"qualifier_svrty": null,
"blunt_phrase": null,
"terse_phrase": null,
"clds": "BKN",
"water_temp": null,
"primary_wave_period": null,
"primary_wave_height": null,
"primary_swell_period": null,
"primary_swell_height": null,
"primary_swell_direction": null,
"secondary_swell_period": null,
"secondary_swell_height": null,
"secondary_swell_direction": null
},
{
"key": "KSAN",
"class": "observation",
"expire_time_gmt": 1612180260,
"obs_id": "KSAN",
"obs_name": "San Diego",
"valid_time_gmt": 1612173060,
"day_ind": "N",
"temp": 59,
"wx_icon": 27,
"icon_extd": 2700,
"wx_phrase": "Mostly Cloudy",
"pressure_tend": null,
"pressure_desc": null,
"dewPt": 47,
"heat_index": 59,
"rh": 64,
"pressure": 30.04,
"vis": 10,
"wc": 59,
"wdir": 260,
"wdir_cardinal": "W",
"gust": null,
"wspd": 5,
"max_temp": null,
"min_temp": null,
"precip_total": null,
"precip_hrly": 0,
"snow_hrly": null,
"uv_desc": "Low",
"feels_like": 59,
"uv_index": 0,
"qualifier": null,
"qualifier_svrty": null,
"blunt_phrase": null,
"terse_phrase": null,
"clds": "BKN",
"water_temp": null,
"primary_wave_period": null,
"primary_wave_height": null,
"primary_swell_period": null,
"primary_swell_height": null,
"primary_swell_direction": null,
"secondary_swell_period": null,
"secondary_swell_height": null,
"secondary_swell_direction": null
}
]
}
推荐阅读
- python - 使用 Selenium 和 Python 在日历日期选择器中选择特定日期
- r - 如何检查向量中相似元素的位置
- flutter - flutter,TextField怎么用呢?
- reactjs - React 将 Parent 的内部 prop 传递给 Child
- c# - 如何处理 asp.net 邮件列表的 jquery 数组?
- java - 启用 GlobalMethodSecurity 时未显示 Spring Actuator JVM 指标
- java - 找到 1 到 N 之间互为镜像的素数
- php - 扩展布局时,所有@yield 都会被渲染,而不是选择的@sections
- sql - WHERE 子句中的“等于 NULL”条件仍然允许查询执行和使用资源
- javascript - 将数组元素从一种形式格式化为另一种形式-JS