首页 > 解决方案 > 将 xml 文件解析为 csv 时跳过一个空元素

问题描述

我目前正在尝试将约 10,000 个元素的 XML 文件解析为 CSV。

我写的脚本一直运行正常,直到遇到某条记录缺少某个子元素时才出错。我不知道该如何让脚本跳过缺失的子元素;我尝试过多种错误检查方法,但它们都只会让脚本无法运行。

import xml.etree.ElementTree as ET
import csv

# Child tags read from every <AID> element, in CSV column order.
LNM_FIELDS = [
    'AID_UNIQUE_IDENTIFIER',
    'LIGHT_LIST_NUMBER',
    'USCG_DISTRICT',
    'AID_NAME',
    'TYPE',
    'ASSIGNED_LATITUDE',
    'ASSIGNED_LONGITUDE',
]

# Fields whose text is a degrees-minutes-seconds string that gets converted
# to signed decimal degrees before being written.
COORDINATE_FIELDS = ('ASSIGNED_LATITUDE', 'ASSIGNED_LONGITUDE')


def dms_to_decimal(dms):
    """Convert a 'DD-MM-SS.sssH' coordinate string to signed decimal degrees.

    ``H`` is the hemisphere letter (N, S, E or W); south and west yield
    negative values. The string is split on '-' instead of being sliced at
    fixed offsets, so two-digit minutes and seconds are read in full.

    Returns None when *dms* is None or blank (the element was missing or
    empty). Raises ValueError when the text is present but not in the
    expected format.
    """
    if dms is None or not dms.strip():
        return None
    text = dms.strip()
    hemisphere = text[-1].upper()
    parts = text[:-1].split('-')
    if hemisphere not in ('N', 'S', 'E', 'W') or len(parts) != 3:
        raise ValueError('unrecognised coordinate: {!r}'.format(dms))
    degrees = int(parts[0])
    minutes = int(parts[1])
    seconds = float(parts[2])
    decimal = degrees + minutes / 60.0 + seconds / 3600.0
    if hemisphere in ('S', 'W'):
        decimal = -decimal
    return decimal


def aid_to_row(aid):
    """Build one CSV row (a list ordered like LNM_FIELDS) from an <AID> element.

    A child element that is missing or has no text produces an empty cell
    instead of raising AttributeError.
    """
    row = []
    for field in LNM_FIELDS:
        child = aid.find(field)
        # find() returns None for an absent child; .text is None for <X/>.
        value = child.text if child is not None else None
        if field in COORDINATE_FIELDS:
            value = dms_to_decimal(value)
        row.append('' if value is None else value)
    return row


def convert_lnm(xml_path='LNM.xml', csv_path='LNMCSV.csv'):
    """Write every DISCREPANCIES/DISCREPANCY/AID record of *xml_path* to *csv_path*.

    The first CSV row is the header (LNM_FIELDS); each following row holds
    one <AID> record, with coordinates converted to decimal degrees.
    """
    root = ET.parse(xml_path).getroot()
    # newline='' lets the csv module control line endings itself, which
    # avoids blank lines between rows on Windows.
    with open(csv_path, 'w', newline='') as lnm_data:
        csvwriter = csv.writer(lnm_data)
        # The header comes from the fixed field list, not from the first
        # record, so an incomplete first <AID> cannot break it.
        csvwriter.writerow(LNM_FIELDS)
        for discrepancies in root.iter('DISCREPANCIES'):
            for discrepancy in discrepancies.findall('DISCREPANCY'):
                for aid in discrepancy.findall('AID'):
                    csvwriter.writerow(aid_to_row(aid))


if __name__ == '__main__':
    convert_lnm()

这是 xml 部分的样子:

<AID>
<AID_UNIQUE_IDENTIFIER>200100637244</AID_UNIQUE_IDENTIFIER>
<LIGHT_LIST_NUMBER>31970</LIGHT_LIST_NUMBER>
<USCG_DISTRICT>8</USCG_DISTRICT>
<AID_NAME>Indian Bayou Daybeacon 6</AID_NAME>
<TYPE>PRIVATE</TYPE>
<ASSIGNED_LATITUDE>30-24-41.700N</ASSIGNED_LATITUDE>
<ASSIGNED_LONGITUDE>086-26-55.740W</ASSIGNED_LONGITUDE>
<WATERWAY_NAME>Indian Bayou</WATERWAY_NAME>
</AID>

某些记录中缺少指定的纬度和指定的经度,导致脚本中断。

标签: python, xml, csv

解决方案


下面的代码使用“N/A”来表示缺失的数据,
并将数据收集到一个字典列表中。

import xml.etree.ElementTree as ET

# Child tags collected from every <AID> element; also the keys of each entry.
AID_ELEMENTS = ['AID_UNIQUE_IDENTIFIER', 'LIGHT_LIST_NUMBER', 'USCG_DISTRICT', 'AID_NAME', 'TYPE', 'ASSIGNED_LATITUDE',
                'ASSIGNED_LONGITUDE']

# Sample document: the second <AID> has its ASSIGNED_LONGITUDE commented out
# to demonstrate a missing sub-element.
xml = '''<r><AID>
    <AID_UNIQUE_IDENTIFIER>200100637244</AID_UNIQUE_IDENTIFIER>
    <LIGHT_LIST_NUMBER>31970</LIGHT_LIST_NUMBER>
    <USCG_DISTRICT>8</USCG_DISTRICT>
    <AID_NAME>Indian Bayou Daybeacon 6</AID_NAME>
    <TYPE>PRIVATE</TYPE>
    <ASSIGNED_LATITUDE>30-24-41.700N</ASSIGNED_LATITUDE>
    <ASSIGNED_LONGITUDE>086-26-55.740W</ASSIGNED_LONGITUDE>
    <WATERWAY_NAME>Indian Bayou</WATERWAY_NAME>
</AID>
<AID>
    <AID_UNIQUE_IDENTIFIER>200100637244</AID_UNIQUE_IDENTIFIER>
    <LIGHT_LIST_NUMBER>31970</LIGHT_LIST_NUMBER>
    <USCG_DISTRICT>8</USCG_DISTRICT>
    <AID_NAME>Indian Bayou Daybeacon 6</AID_NAME>
    <TYPE>PRIVATE</TYPE>
    <ASSIGNED_LATITUDE>30-24-41.700N</ASSIGNED_LATITUDE>
    <!-- <ASSIGNED_LONGITUDE>086-26-55.740W</ASSIGNED_LONGITUDE> -->
    <WATERWAY_NAME>Indian Bayou</WATERWAY_NAME>
</AID>
</r>'''


def aid_to_entry(aid, element_names=AID_ELEMENTS, missing='N/A'):
    """Return a dict mapping each name in *element_names* to its text in *aid*.

    *missing* is stored when the sub-element is absent, and also when it is
    present but has no text (``<X/>`` or whitespace only); otherwise such an
    element would put None or blank text into the entry.
    """
    entry = {}
    for sub_element_name in element_names:
        ele = aid.find('.//{}'.format(sub_element_name))
        # find() gives None for an absent element; .text is None for <X/>.
        text = ele.text if ele is not None else None
        entry[sub_element_name] = text if text and text.strip() else missing
    return entry


aids = []
root = ET.fromstring(xml)
for aid in root.findall('.//AID'):
    aids.append(aid_to_entry(aid))
for aid in aids:
    print(aid)

输出

{'AID_UNIQUE_IDENTIFIER': '200100637244', 'LIGHT_LIST_NUMBER': '31970', 'USCG_DISTRICT': '8', 'AID_NAME': 'Indian Bayou Daybeacon 6', 'TYPE': 'PRIVATE', 'ASSIGNED_LATITUDE': '30-24-41.700N', 'ASSIGNED_LONGITUDE': '086-26-55.740W'}
{'AID_UNIQUE_IDENTIFIER': '200100637244', 'LIGHT_LIST_NUMBER': '31970', 'USCG_DISTRICT': '8', 'AID_NAME': 'Indian Bayou Daybeacon 6', 'TYPE': 'PRIVATE', 'ASSIGNED_LATITUDE': '30-24-41.700N', 'ASSIGNED_LONGITUDE': 'N/A'}

推荐阅读