python - 抓取的数据不会保存到 csv 文件,因为它一直返回一个空白的 csv 文件。
问题描述
谁能看到这段代码有什么问题?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
import csv
def races(main_url):
driver = webdriver.Chrome()
driver.get(main_url)
driver.implicitly_wait(2)
races = driver.find_elements_by_class_name('time-location')
races = [race.text[:5] for race in races]
races = [race.replace(':', '') for race in races]
driver.close()
return races
def scrape(url):
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(2)
driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//[@id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
for horse in driver.find_elements_by_class_name('card-item'):
horseName = horse.find_element_by_class_name('form-link').text
times = horse.find_elements_by_class_name('sectionals-time')
times = [time.text for time in times]
print('{}: {}'.format(horseName, times))
print()
driver.close()
因此,在下面的下一点,我试图将数据保存到 df,但它在打开时返回一个空白文档。df = open('jan1.csv', 'w+') 是否应该不将抓取的数据存储到 csv 文件中。我显然错过了一些东西,但看不到什么。
def main():
df = open('jan1.csv', 'w+')
df.close()
date = '1-January-2018'
main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
for race in races(main_url):
url = main_url + '/' + race
print(url)
scrape(url)
if __name__ == '__main__':
main()
解决方案
您的代码似乎在几个地方都损坏了,即使修复它,我也会收到超时错误。
尝试以下步骤:
添加 pandas 以方便数据处理:
import pandas as pd
def scrape(url):
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(2)
driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//[@id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
# add empty list to save scraped data
data = []
for horse in driver.find_elements_by_class_name('card-item'):
horseName = horse.find_element_by_class_name('form-link').text
times = horse.find_elements_by_class_name('sectionals-time')
times = [time.text for time in times]
print('{}: {}'.format(horseName, times))
data.append([horseName, times])
print()
driver.close()
# return your data!
return data
然后在你的 main 函数中改变它:
def main():
date = '1-January-2018'
main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
tmp = []
for race in races(main_url):
url = main_url + '/' + race
print(url)
tmp.append(scrape(url))
df = pd.DataFrame(tmp)
df.to_csv("jan1.csv")
或者,如果您只想坚持使用 csv(没有 pandas):
with open("jan1.csv", "w+") as file:
file.write(your_data_var_here)
推荐阅读
- spring-boot - 将未经身份验证的用户限制为首页结果
- laravel - 多条路线定义得到 404
- ag-grid - ag grid : js : 如何找到应用了哪个过滤器
- c# - 如何使用 Word API 查找行号?
- android - ListView JSON 数组不起作用
- c# - .Net Datagridview 不显示表情符号
- java - 在带有 lambda 参数的 lambdas 中使用谓词
- python - 使用 Pandas 将值从一列复制到另一列
- python-2.7 - 面临错误:ProgrammingError:LOB 变量在后续提取后不再有效
- html - Firefox padding bottom box-sizing: border-box