python - "Operand should contain 1 column(s)" error - to_sql() python mysql
Problem description
I can't seem to figure out the error I'm getting.
I'm basically web-scraping a site and want to insert rows into MySQL each time I visit one of a series of pages. Printing to an Excel sheet works as expected, but the MySQL write does not: it fails with an OperationalError, "Operand should contain 1 column(s)".
I added a few debug lines and found that the dataframe has 1 row and 13 columns, which may be the cause of the "Operand should contain 1 column(s)" error.
The offending line turns out to be:
df.to_sql(name='stamprallydb_crawl_result',
          con=engine, if_exists='append', index=False)
The code is below; thanks.
import pymysql
from sqlalchemy import create_engine
pymysql.install_as_MySQLdb()
from Stamprally import StamprallyInfo
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import os
import sys
import time
import math
import urllib.request
import numpy as np
import pandas as pd
import re
import MySQLdb
# Start Recording Program Loading Time
programStart = time.time()
prefectureNameList = ["北海道", "青森県", "岩手県", "宮城県", "秋田県", "山形県", "福島県", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県", "東京都", "神奈川県", "新潟県", "富山県", "石川県", "福井県", "山梨県", "長野県", "岐阜県", "静岡県",
"愛知県", "三重県", "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県", "鳥取県", "島根県", "岡山県", "広島県", "山口県", "徳島県", "香川県", "愛媛県", "高知県", "福岡県", "佐賀県", "長崎県", "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県"]
#prefectureNameList = ["東京都"]
data = []
timeStampData = []
#timeStampDataAggregate = []
contentAggregator = []
timeStampData.append("프로그램 시작")
timeStampData.append(programStart)
# db = Db()
main_url = 'https://stamprally.org/'
# List that holds the stamp rally info
# stamprally_list = []
# Load the driver
# For MAC users driver = wd.Chrome(executable_path='./chromedriver')
# End Recording Program Loading Time
programEnd = time.time()
timeStampData.append(programEnd - programStart)
# timeStampDataAggregate.append(timeStampData)
timeStamp = pd.DataFrame(np.array([timeStampData]), columns=[
    '설명', 'TimeStamp', '소요기간'])
print(timeStamp)
timeStampData.clear()
print(timeStampData)
# Start Recording Selenium Driver Loading
print("셀레니엄 ")
timeStampData.append("셀레니엄 드라이버 로딩")
seleniumStart = time.time()
timeStampData.append(seleniumStart)
driver = wd.Chrome(executable_path='chromedriver.exe')
driver.get(main_url)
seleniumEnd = time.time()
timeStampData.append(seleniumEnd - seleniumStart)
print(timeStampData)
rowAddTimeStampSelenium = pd.Series(timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(rowAddTimeStampSelenium, ignore_index=True)
# timeStampDataAggregate.append(timeStampData)
timeStampData.clear()
# =============================================================================================
prefectureValueStorage = [x.get_attribute('value') for x in driver.find_elements_by_xpath(
    "//select[@name='search_cat1']/option[@class='level-1']")]
# =============================================================================================
prefectureNameIterator = -1
print("START OF PREFECTURE " + prefectureNameList[prefectureNameIterator])
# Loop through all the different prefectures
for prefectureValue in prefectureValueStorage:
    print("Prefecture Start Time")
    prefectureStart = time.time()
    prefectureNameIterator += 1
    # Add Prefecture Name to timeStampData
    timeStampData.append(prefectureNameList[prefectureNameIterator])
    timeStampData.append(prefectureStart)
    print(timeStampData)
    driver.get(
        f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={prefectureValue}&search_cat2=0")
    # Calculate How Many Times To Run Page Loop
    imageDownloadCounter = 1
    totalList = driver.find_element_by_css_selector(
        'div.page_navi2.clearfix>p').text  # get_attribute('text')
    totalListNum = totalList.split("件中")
    # Add TotalListNum to the contentAggregator
    # contentAggregator.append(int(totalListNum[0]))
    if int(totalListNum[0]) % 10 != 0:
        pageLoopCount = math.ceil((int(totalListNum[0])/10))
    else:
        pageLoopCount = int(totalListNum[0])/10
    # continue
    currentpage = 0
    while currentpage < pageLoopCount:
        currentpage += 1
        print("Current Page " + str(currentpage))
        # Loop through all the Listings within the prefecture page
        driver.get(
            f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={prefectureValue}&search_cat2=0&paged={currentpage}")
        # print("Loading Page %s" % currentpage)
        # ========================================================================================================================================================
        # Add prefectureName to the contentAggregator
        # contentAggregator.append(prefectureNameList[prefectureNameIterator])
        # Gather All List Links
        urlList = []
        currentUrlCounter = 0
        listURLContainer = driver.find_elements_by_css_selector(
            '#post_list2 > li > a')
        # Put all the lists in one Array
        for url in listURLContainer:
            urlList.append(url.get_attribute('href'))
        # Loop through all the links
        for listURL in listURLContainer:
            contentAggregator = []
            # Add TotalListNum to the contentAggregator
            contentAggregator.append(int(totalListNum[0]))
            # Add prefectureName to the contentAggregator
            contentAggregator.append(
                prefectureNameList[prefectureNameIterator])
            # print('article Link: ')
            # print(urlList[currentUrlCounter])
            # Add listLink to the contentAggregator
            contentAggregator.append(
                urlList[currentUrlCounter])
            # For each link in listURLContainer:
            driver.get(urlList[currentUrlCounter])
            currentUrlCounter += 1
            locationTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
                "//*[@id='post_meta_top']/li[1]/a[@class='cat-category']")]
            # print(locationTag)
            # Add locationTag to the contentAggregator
            contentAggregator.append(locationTag)
            eventTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
                "//*[@id='post_meta_top']/li[2]/a[@class='cat-category2']")]
            # Add eventTag to the contentAggregator
            contentAggregator.append(eventTag)
            # Select Post Date & Remove Non-Text Variable
            availablePeriod = (driver.find_element_by_css_selector(
                'div#post_date')).text.split("( ")
            availablePeriodFormatted = availablePeriod[0].replace("開催期間:", "")
            availableStartDate = availablePeriod[0].split(" ~ ")
            endDate = availableStartDate[1]
            availableStartDateFormatted = availableStartDate[0].replace(
                "開催期間:", "")
            # Select Latest Update Date
            lastUpdatedDate = driver.find_element_by_css_selector(
                'time.entry-date.updated').text
            # print("Available Period:")
            # print(availablePeriodFormatted)
            # Add Available Period to the contentAggregator
            contentAggregator.append(availablePeriodFormatted)
            # print("Available StartDate:")
            # print(availableStartDateFormatted)
            # Add Available StartDate to the contentAggregator
            contentAggregator.append(availableStartDateFormatted)
            # print("End Date: ")
            # print(endDate)
            # Add endDate to the contentAggregator
            contentAggregator.append(endDate)
            # print("Last Updated:")
            # print(lastUpdatedDate[6:])
            # Add lastUpdatedDate to the contentAggregator
            contentAggregator.append(lastUpdatedDate[6:])
            # ========================================================================================================================================================
            # Download Main Post Image
            mainImageUrl = driver.find_element_by_css_selector(
                'img.attachment-post-thumbnail.size-post-thumbnail.wp-post-image').get_attribute('src')
            # Add mainImageUrl to the contentAggregator
            contentAggregator.append(mainImageUrl)
            # Save Post Main Title
            # postTitle = driver.find_element_by_xpath(
            #     "//*[@id='post_title']/text()")
            postTitle1 = driver.find_element_by_css_selector(
                'h2#post_title').text.replace("開催終了", "")
            postTitle = postTitle1.replace("ただいま開催中", "")
            # Replace all special characters in the string with spaces
            # Pass the string in search
            # specialCharacter = "!@#$%^&*()-+?_=,<>/"
            # if any(character in specialCharacter for character in postTitle):
            removeSpecialChars = postTitle.translate(
                {ord(c): " " for c in "!@#$%^&*()[]{};:,./<>?\\|`~-=_+"})
            postTitle = removeSpecialChars
            print(postTitle)
            # else:
            #     continue
            # Add Title to the contentAggregator
            contentAggregator.append(postTitle)
            # print("Title: ")
            # print(postTitle)
            # Find out Event validity status
            eventValidity = driver.find_element_by_xpath(
                "//*[@id='post_title']/span").text
            # Add eventValidity to the contentAggregator
            contentAggregator.append(eventValidity)
            # Save Post Main Image
            urllib.request.urlretrieve(mainImageUrl, (str(
                prefectureNameList[prefectureNameIterator])+postTitle+str(imageDownloadCounter) + ".png"))
            imageDownloadCounter += 1
            # Get Inner Website Link
            innerWebSiteButtonURL = driver.find_element_by_css_selector(
                'div.post_content.clearfix > div >a').get_attribute('href')
            # print("inner Website Button URL: " + innerWebSiteButtonURL)
            # Add innerWebSiteURL to the contentAggregator
            contentAggregator.append(innerWebSiteButtonURL)
            # Gather Main Post Text Content
            mainText = driver.find_elements_by_css_selector(
                'div.post_content.clearfix > p')
            mainContentText = []
            # Remove Disclaimer text
            for mainContentDetail in mainText:
                mainContentText.append(mainContentDetail.text)
            mainContextTextCount = len(mainContentText)-1
            # print(mainContentText[:mainContextTextCount])
            # Add Main Post Text Content to the contentAggregator
            contentAggregator.append(mainContentText[:mainContextTextCount])
            # ========================================================================================================================================================
            contentReorder = [1, 0, 10, 11, 5, 6, 7, 8, 13, 3, 4, 9, 12, 2]
            contentAggregator = [contentAggregator[i] for i in contentReorder]
            print("=====================================================================================================================================================")
            # print(contentAggregator)
            data.append(contentAggregator)
            # print(data)
            # print(pd.DataFrame(data, columns=["Total List Number", "Prefecture", "ListLink", "Location Tag", "Event Tag", "Available Period",
            #                                   "Available StartDate", "End Date", "Last Updated", "Main Image URL", "Title", "Event Validity", "innerWebSiteURL", "mainText"]))
            df = pd.DataFrame(data, columns=["Prefecture", "Total List Number", "Title", "Event Validity", "Available Period", "Available StartDate",
                                             "End Date", "Last Updated", "mainText", "Location Tag", "Event Tag", "Main Image URL", "innerWebSiteURL", "ListLink"])
            df2 = pd.DataFrame(data)
            print(df2)
            # try:
            engine = create_engine(
                "mysql+mysqldb://root:abcdefgH1@localhost/stamprallydb", encoding='utf-8')
            conn = engine.connect()
            df.to_sql(name='stamprallydb_crawl_result',
                      con=engine, if_exists='append', index=False)
            # except:
            #     print("Error in Mysql connection")
    else:
        prefectureEnd = time.time()
        timeStampData.append(prefectureEnd-prefectureStart)
        rowAddTimeStampPrefecture = pd.Series(
            timeStampData, index=timeStamp.columns)
        timeStamp = timeStamp.append(
            rowAddTimeStampPrefecture, ignore_index=True)
        timeStampData.clear()
# timeStampDataAggregate.append(timeStampData)
excelFileStart = time.time()
xlwriter = pd.ExcelWriter('StampRally_Crawler.xlsx')
df.to_excel(xlwriter, sheet_name="Stamprally.org Crawl Result")
# Add the last Series: "엑셀 파일 저장" (Excel file save)
excelFileEnd = time.time()
timeStampData.append("엑셀 파일 저장")
timeStampData.append(excelFileStart)
timeStampData.append(excelFileEnd-excelFileStart)
rowAddTimeStampPrefecture = pd.Series(timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(rowAddTimeStampPrefecture, ignore_index=True)
# timeStampDataAggregate.append(timeStampData)
# Create New sheet and write to Excel
timeStamp.to_excel(xlwriter, sheet_name="TimeStamp Result")
xlwriter.close()
# Close Off
driver.close()
driver.quit()
sys.exit()
Solution
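MySQL raises error 1241, "Operand should contain 1 column(s)", when a single bound parameter expands into a parenthesized row of several values, e.g. INSERT ... VALUES (..., ('a', 'b'), ...). That is most likely what happens here: the "Location Tag" and "Event Tag" cells are built with list comprehensions, and "mainText" is a slice of mainContentText, so those DataFrame cells hold Python lists rather than scalars, and the driver renders each list as a multi-value row expression.
A minimal sketch of one possible fix: serialize the list-valued cells to plain strings before calling to_sql(). The " | " separator is an arbitrary choice; str(), json.dumps(), or normalizing the tags into a child table would work just as well.
# Flatten list-valued cells so each cell binds as a single SQL value.
# Column names match the df built above; " | " is an arbitrary separator.
for col in ["Location Tag", "Event Tag", "mainText"]:
    df[col] = df[col].apply(
        lambda v: " | ".join(map(str, v)) if isinstance(v, (list, tuple)) else v)

df.to_sql(name='stamprallydb_crawl_result',
          con=engine, if_exists='append', index=False)
With every cell reduced to a scalar, each row binds one value per column and the insert should go through. Separately, note that data keeps growing while df is rebuilt from it for every listing, so if_exists='append' will re-insert all earlier rows on each pass; building the DataFrame from only the current contentAggregator, or writing once after the loops finish, avoids the duplicates.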