python - ParseError:格式不正确(无效标记):第 47 行,第 27 列
问题描述
我有一个程序需要 452,793 个 xml 文件将其转换为 pandas 数据框,然后将其转换为 .csv 文件......好吧它假设。它经历了从特定标签中提取信息并将它们插入数据帧中特定位置的过程。但是在 452,793 的文件 6,274 中,它会抛出此错误“ParseError: not well-formed (invalid token): line 47, column 27” 错误消息图片。
我已经尝试添加encoding="utf-8-sig"和encoding="utf-8",但没有成功。
这是我的代码。如果您需要我解释这些,请告诉我。存在缩进错误。如果有任何不正确的缩进,那是因为在导入我的代码时堆栈溢出格式。
import xml.etree.ElementTree as ET
import pandas as pd
import glob as glob
from urllib.request import urlopen as uReq
import urllib.request as ur
import shutil
import zipfile
pd.set_option('display.max_columns', 500)
run = 0
globfiles = glob.glob("C:/Users/bbrown/Projects/Database/NSF/Trash/*.xml")
glob.glob("C:/Users/bbrown/Projects/Database/NSF/Trash/*.xml")
for file in globfiles: #The top portion
run += 1
print('globbing ',run, file)
parser = ET.XMLParser(encoding="utf-8")
tree = ET.parse(file, parser)
root = tree.getroot()
x = []
y = []
for child in root:
for child2 in child:
x.append(child2.tag)
y.append(child2.text)
for i in range(0, len(x)):
if x[i] == 'AwardTitle':
data = [y[i]]
df1 = pd.DataFrame(data, columns = ['AwardTitle'])
if x[i] == 'AwardEffectiveDate':
data = [y[i]]
df2 = pd.DataFrame(data, columns = ['AwardEffectiveDate'])
if x[i] == 'AwardExpirationDate':
data = [y[i]]
df3 = pd.DataFrame(data, columns = ['AwardExpirationDate'])
if x[i] == 'AwardTotalIntnAmount':
data = [y[i]]
df4 = pd.DataFrame(data, columns = ['AwardTotalIntnAmount'])
if x[i] == 'AwardAmount':
data = [y[i]]
df5 = pd.DataFrame(data, columns = ['AwardAmount'])
if x[i] == 'AbstractNarration':
data = [y[i]]
df6 = pd.DataFrame(data, columns = ['AbstractNarration'])
if x[i] == 'MinAmdLetterDate':
data = [y[i]]
df7 = pd.DataFrame(data, columns = ['MinAmdLetterDate'])
if x[i] == 'MaxAmdLetterDate': #index 10
data = [y[i]]
df8 = pd.DataFrame(data, columns = ['MaxAmdLetterDate'])
if x[i] == 'AwardID': #index 12
data = [y[i]]
df9 = pd.DataFrame(data, columns = ['AwardID'])
a = []
b = []
#0,1,6,7,8,9,10...22
for child in root:
for child2 in child:
for child3 in child2:
a.append(child3.tag)
b.append(child3.text)
for i in range(0, len(a)):
if a[i] == 'Value':
data = [b[i]]
df10 = pd.DataFrame(data, columns = ['Value'])
if a[i] == 'Code' and i == 1:
data = [b[i]]
df11 = pd.DataFrame(data, columns = ['Organization Code'])
if a[i] == 'SignBlockName':
data = [b[i]]
df12 = pd.DataFrame(data, columns = ['SignBlockName'])
if a[i] == 'FirstName':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'FirstName':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df13 = pd.DataFrame(data, columns = ['FirstName'])
if a[i] == 'LastName':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'LastName':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df14 = pd.DataFrame(data, columns = ['LastName'])
if a[i] == 'EmailAddress':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'EmailAddress':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df15 = pd.DataFrame(data, columns = ['EmailAddress'])
if a[i] == 'StartDate':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'StartDate':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df16 = pd.DataFrame(data, columns = ['StartDate'])
if a[i] == 'EndDate':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'EndDate':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df17 = pd.DataFrame(data, columns = ['EndDate'])
if a[i] == 'RoleCode':
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'RoleCode':
iterlist.append(x)
x += 1
for eachnum in iterlist:
namelist.append(b[eachnum])
data = [str(namelist)]
df18 = pd.DataFrame(data, columns = ['RoleCode'])
if a[i] == 'Name':
data = [b[i]]
df19 = pd.DataFrame(data, columns = ['Name'])
if a[i] == 'CityName':
data = [b[i]]
df20 = pd.DataFrame(data, columns = ['CityName'])
if a[i] == 'ZipCode':
data = [b[i]]
df21 = pd.DataFrame(data, columns = ['ZipCode'])
if a[i] == 'PhoneNumber':
data = [b[i]]
df22 = pd.DataFrame(data, columns = ['PhoneNumber'])
if a[i] == 'StreetAddress':
data = [b[i]]
df23 = pd.DataFrame(data, columns = ['StreetAddress'])
if a[i] == 'StateName':
data = [b[i]]
df24 = pd.DataFrame(data, columns = ['StateName'])
if a[i] == 'CountryName':
data = [b[i]]
df25 = pd.DataFrame(data, columns = ['CountryName'])
if a[i] == 'StateCode':
data = [b[i]]
df26 = pd.DataFrame(data, columns = ['StateCode'])
if a[i] == 'Code' and i != 1:
x = 0
iterlist = []
namelist = []
for word in a:
if word == 'Code':
iterlist.append(x)
x += 1
for eachnum in iterlist:
if eachnum != 1:
value = b[eachnum],b[eachnum+1]
namelist.append(value)
data = [str(namelist)]
df27 = pd.DataFrame(data, columns = ['Codes'])
aaa=[]
bbb=[]
for child in root:
for child2 in child:
aa.append(child2.tag)
bb.append(child2.text)
for child3 in child2:
a.append(child3.tag)
b.append(child3.text)
for child4 in child3:
aaa.append(child4.tag)
bbb.append(child4.text)
dfvar = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18,df19, df20, df21, df22, df23, df24, df25, df26, df27], axis=1)
df = df.append(dfvar)
**您可以从这里下载 xml 文件的样本* - https://www.nsf.gov/awardsearch/download?DownloadFileName=2019&All=true
我只需要它停止抛出错误。通过修复导致它的任何原因,或忽略错误。否则这是黄金工作!
解决方案
推荐阅读
- ms-access - 需要有关我的更新查询的建议
- java - 从 Firebase Android 读取值并更新它
- bash - 使用来自 API 的数据更新提交消息的 Git 钩子
- c# - 如何将我自己的 Iframe BotFramework (dot Net) 发布到 Azure?
- swift - 通过在函数调用中插入字符来调用 Swift 函数
- bash - 并行化调用多个文件并使用 bcp 的 bash 脚本
- javascript - 如何在 Javascript 中重新排序子标签
- angular - 等待 Promise 时未显示角忙光标
- multithreading - 如何在go中转换以下Thread语句
- python - 我的 python 代码不会将视频帧保存为图像