python - 使用 python 将多表 XML 转换为 Excel
问题描述
我有数百个 XML 文件需要转换为 XLSX 以进行下游分析,但我有太多的手动操作。我一直在尝试找出一种 python 方法来做到这一点,并找到了许多使用 xml2xlsx 或 xml.etree.ElementTree 的教程。但是,我遇到的问题是所有这些都在处理单张纸,而我的 xml 有 3 张纸(例如,请参见下面的部分)。如果有人知道如何做到这一点,或者我应该如何解析它,我将不胜感激。
<?mso-application progid="Excel.Sheet"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
xmlns:html="http://www.w3.org/TR/REC-html40">
<ExcelWorkbook xmlns="urn:schemas-microsoft-com:office:excel">
<ProtectStructure>False</ProtectStructure>
<ProtectWindows>False</ProtectWindows>
</ExcelWorkbook>
<Worksheet ss:Name="Nodes">
<Table>
<Row>
<Cell><Data ss:Type="String">Node ID</Data></Cell>
<Cell><Data ss:Type="String">X Coord</Data></Cell>
<Cell><Data ss:Type="String">Y Coord</Data></Cell>
<Cell><Data ss:Type="String">Z Coord</Data></Cell>
<Cell><Data ss:Type="String">Coordination Number</Data></Cell>
</Row>
<Row>
<Cell><Data ss:Type="Number">0</Data></Cell>
<Cell><Data ss:Type="Number">12.25489235</Data></Cell>
<Cell><Data ss:Type="Number">21.835989</Data></Cell>
<Cell><Data ss:Type="Number">6.916931152</Data></Cell>
<Cell><Data ss:Type="Number">1</Data></Cell>
</Row>
...
</Table>
<WorksheetOptions xmlns="urn:schemas-microsoft-com:office:excel">
<ProtectObjects>False</ProtectObjects>
<ProtectScenarios>False</ProtectScenarios>
</WorksheetOptions>
</Worksheet>
<Worksheet ss:Name="Points">
<Table>
<Row>
<Cell><Data ss:Type="String">Point ID</Data></Cell>
<Cell><Data ss:Type="String">thickness</Data></Cell>
<Cell><Data ss:Type="String">X Coord</Data></Cell>
<Cell><Data ss:Type="String">Y Coord</Data></Cell>
<Cell><Data ss:Type="String">Z Coord</Data></Cell>
</Row>
...
</Table>
<WorksheetOptions xmlns="urn:schemas-microsoft-com:office:excel">
<ProtectObjects>False</ProtectObjects>
<ProtectScenarios>False</ProtectScenarios>
</WorksheetOptions>
</Worksheet>
<Worksheet ss:Name="Segments">
<Table>
<Row>
<Cell><Data ss:Type="String">Segment ID</Data></Cell>
<Cell><Data ss:Type="String">Node ID #1</Data></Cell>
<Cell><Data ss:Type="String">Node ID #2</Data></Cell>
<Cell><Data ss:Type="String">Point IDs</Data></Cell>
</Row>
...
这是我到目前为止所尝试的,首先使用 xml2xlsx,它返回一个空的 excel 表:
from xml2xlsx import xml2xlsx
from lxml import etree
import sys
import os
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--files', nargs = '+', required = True, help='one or more xml files to convert')
#parser.add_argument('-c', '--conditions', nargs = '+', default = ['Condition 1', 'Condition 2'], help = 'Names of conditions to analyze. replace any spaces in filename with underscores.')
args = parser.parse_args()
for file in args.files:
filename = os.path.splitext(os.path.basename(file))[0].lower()
filename = filename.replace(' ', '_')
output = filename+'.xlsx'
with open(file) as xml_file:
template = xml_file.read().encode()
with open(output, 'wb') as xlsx_file:
xlsx_file.write(xml2xlsx(template))
然后这个预先编写的代码,我无法开始工作。
#XML TO EXCEL FILE
import xml.etree.ElementTree as ET
from openpyxl import Workbook
import os
import sys
import argparse
def readFile(filename):
'''
Checks if file exists, parses the file and extracts the needed data
returns a 2 dimensional list without "header"
'''
if not os.path.exists(filename): return
tree = ET.parse(filename)
root = tree.getroot()
#you may need to adjust the keys based on your file structure
dict_keys = ["id","first_name","last_name","email","gender","ip_address" ] #all keys to be extracted from xml
mdlist = []
for child in root:
temp = []
for key in dict_keys:
temp.append(child.find(key).text)
mdlist.append(temp)
return mdlist
def to_Excel(mdlist):
'''
Generates excel file with given data
mdlist: 2 Dimenusional list containing data
'''
wb = Workbook()
ws = wb.active
for i,row in enumerate(mdlist):
for j,value in enumerate(row):
ws.cell(row=i+1, column=j+1).value = value
newfilename = os.path.abspath("./xml_to_excel.xlsx")
wb.save(newfilename)
print("complete")
return
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--files', nargs = '+', required = True, help='one or more xml files to convert')
#parser.add_argument('-c', '--conditions', nargs = '+', default = ['Condition 1', 'Condition 2'], help = 'Names of conditions to analyze. replace any spaces in filename with underscores.')
args = parser.parse_args()
for f in args.files:
result = readFile(f)
if result:
to_Excel(result)
最后,这个,它吐出了一长串无意义的标签。
import xml.etree.ElementTree as ET
tree = ET.parse(r'pathToXML')
root = tree.getroot()
tag = root.tag
att = root.attrib
#Flatten XML to CSV
for child in root:
for subchild in child:
mainlevel = child.tag
xmltocsv = ''
for elem in root.iter():
if elem.tag == root.tag:
continue
if elem.tag == mainlevel:
xmltocsv = xmltocsv + '\n'
xmltocsv = xmltocsv + str(elem.tag).rstrip() + str(elem.attrib).rstrip() + ';' + str(elem.text).rstrip() + ';'
with open('output.csv', 'w') as file:
file.write(xmltocsv)
解决方案
尝试这个。
from openpyxl import Workbook
from simplified_scrapy import SimplifiedDoc, utils
def readFile(filename):
xml = utils.getFileContent(filename)
doc = SimplifiedDoc(xml)
tables = doc.selects('Worksheet').selects('Row').selects('Cell').text # Get all data
sheetNames = doc.selects('Worksheet>ss:Name()') # Get sheet name
return sheetNames,tables
def to_Excel(sheetNames,tables):
wb = Workbook() # Create Workbook
for i in range(len(sheetNames)):
worksheet = wb.create_sheet(sheetNames[i]) # Create sheet
for row in tables[i]:
worksheet.append(row)
wb.save('xml_to_excel.xlsx') # Save file
to_Excel(*readFile("pathToXML"))
推荐阅读
- python - 两堆游戏(黑客排名问题)算法
- algorithm - 寻找缺失的排列
- c++ - 默认 ctor 没有匹配的调用
- java - Joda LocalDate 比较是否相等或之前和之后
- scala - 使用 Spark Scala 进行区间合并
- python - 如何使用 7-zip 命令(或任何替代解决方案)获取 RAR 文件卷索引
- python - Jupyter notebook 在终端中显示输出
- sql-server - SQL-Server 2019 之后具有 IMAGE 数据类型的列会发生什么情况?
- vb.net - 查询ArrayList返回Arraylist的索引
- charts - Vuetify 迷你图抛出错误“无法读取未定义的属性 'getTotalLength'”