首页 > 解决方案 > 使用 Python 将带有 xsd 的 xml 解析为 CSV?

问题描述

我正在尝试解析从 OSHA 网站下载的一个非常大的 XML 文件并将其转换为 CSV,以便我可以在 SQLite 数据库中使用它以及其他一些电子表格。我只会使用在线转换器,但osha 文件显然对所有人来说都太大了。

我用 Python 写了一个脚本,看起来像这样:

import csv
import xml.etree.cElementTree as ET
tree = ET.parse('data.xml')
root = tree.getroot()

xml_data_to_csv =open('Out.csv', 'w')

list_head=[]

Csv_writer=csv.writer(xml_data_to_csv)

count=0
for element in root.findall('data'): 
    List_nodes =[]

    if count== 0:
        inspection_number = element.find('inspection_number').tag
        list_head.append(inspection_number)
        
        establishment_name = element.find('establishment_name').tag
        list_head.append(establishment_name)
        
        city = element.find('city')
        list_head.append(city)

        state = element.find('state')
        list_head.append(state)
        
        zip_code = element.find('zip_code')
        list_head.append(zip_code)
        
        sic_code = element.find('sic_code')
        list_head.append(sic_code)
        
        naics_code = element.find('naics_code')
        list_head.append(naics_code)
        
        sampling_number = element.find('sampling_number')
        list_head.append(sampling_number)
        
        office_id = element.find('office_id')
        list_head.append(office_id)
        
        date_sampled = element.find('date_sampled')
        list_head.append(date_sampled)
        
        date_reported = element.find('date_reported')
        list_head.append(date_reported)
        
        eight_hour_twa_calc = element.find('eight_hour_twa_calc')
        list_head.append(eight_hour_twa_calc)
        
        instrument_type = element.find('instrument_type')
        list_head.append(instrument_type)
        
        lab_number = element.find('lab_number')
        list_head.append(lab_number)
        
        field_number = element.find('field_number')
        list_head.append(field_number)
        
        sample_type = element.find('sample_type')
        list_head.append(sample_type)
        
        blank_used = element.find('blank_used')
        list_head.append(blank_used)
        
        time_sampled = element.find('time_sampled')
        list_head.append(time_sampled)
        
        air_volume_sampled = element.find('air_volume_sampled')
        list_head.append(air_volume_sampled)
        
        sample_weight = element.find('sample_weight')
        list_head.append(sample_weight)
        
        imis_substance_code = element.find('imis_substance_code')
        list_head.append(imis_substance_code)
        
        substance = element.find('substance')
        list_head.append(substance)
        
        sample_result = element.find('sample_result')
        list_head.append(sample_result)
        
        unit_of_measurement = element.find('unit_of_measurement')
        list_head.append(unit_of_measurement)
        
        qualifier = element.find('qualifier')
        list_head.append(qualifier)

        Csv_writer.writerow(list_head)
        count = +1

    inspection_number = element.find('inspection_number').text
    List_nodes.append(inspection_number)

    establishment_name = element.find('establishment_name').text
    List_nodes.append(establishment_name)

    city = element.find('city').text
    List_nodes.append(city)

    state = element.find('state').text
    List_nodes.append(state)

    zip_code = element.find('zip_code').text
    List_nodes.append(zip_code)    

    sic_code = element.find('sic_code').text
    List_nodes.append(sic_code)

    naics_code = element.find('naics_code').text
    List_nodes.append(naics_code)

    sampling_number = element.find('sampling_number').text
    List_nodes.append(sampling_number)

    office_id = element.find('office_id').text
    List_nodes.append(office_id)

    date_sampled = element.find('date_sampled').text
    List_nodes.append(date_sampled)

    date_reported = element.find('date_reported').text
    List_nodes.append(date_reported)

    eight_hour_twa_calc = element.find('eight_hour_twa_calc').text
    List_nodes.append(eight_hour_twa_calc)    
    
    instrument_type = element.find('instrument_type').text
    List_nodes.append(instrument_type)

    lab_number = element.find('lab_number').text
    List_nodes.append(lab_number)

    field_number = element.find('field_number').text
    List_nodes.append(field_number)

    sample_type = element.find('sample_type').text
    List_nodes.append(sample_type)

    blank_used = element.find('blank_used').text
    List_nodes.append()

    time_sampled = element.find('time_sampled').text
    List_nodes.append(time_sampled)

    air_volume_sampled = element.find('air_volume_sampled').text
    List_nodes.append(air_volume_sampled)    
    
    sample_weight = element.find('sample_weight').text
    List_nodes.append(sample_weight)

    imis_substance_code = element.find('imis_substance_code').text
    List_nodes.append(imis_substance_code)

    substance = element.find('substance').text
    List_nodes.append(substance)

    sample_result = element.find('sample_result').text
    List_nodes.append(sample_result)

    unit_of_measurement = element.find('unit_of_measurement').text 
    List_nodes.append(unit_of_measurement)

    qualifier= element.find('qualifier').text
    List_nodes.append(qualifier)

    Csv_writer.writerow(List_nodes)

xml_data_to_csv.close()

但是当我运行代码时,我得到一个没有任何内容的 CSV。我怀疑这可能与与 XML 关联的 XSD 文件有关,但我不完全确定。

有谁知道这里的问题是什么?

标签: pythonxmlcsv

解决方案


下面的代码是您的代码的“紧凑”版本。

它假定 XML 结构看起来像脚本变量中的那样xml。(基于https://www.osha.gov/opengov/sample_data_2011.zip

此示例代码与您的代码的主要区别在于,我定义了我想要收集一次的字段(请参阅 参考资料FIELDS),并且我在整个脚本中使用了这个定义。

import xml.etree.ElementTree as ET

FIELDS = ['lab_number', 'instrument_type']  # TODO add more fields

xml = '''<main xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="health_sample_data.xsd">
  <DATA_RECORD>
    <inspection_number>316180165</inspection_number>
    <establishment_name>PROFESSIONAL ENGINEERING SERVICES, LLC.</establishment_name>
    <city>EUFAULA</city>
    <state>AL</state>
    <zip_code>36027</zip_code>
    <sic_code>1799</sic_code>
    <naics_code>238990</naics_code>
    <sampling_number>434866166</sampling_number>
    <office_id>418600</office_id>
    <date_sampled>2011-12-30</date_sampled>
    <date_reported>2011-12-30</date_reported>
    <eight_hour_twa_calc>N</eight_hour_twa_calc>
    <instrument_type>TBD</instrument_type>
    <lab_number>L13645</lab_number>
    <field_number>S1</field_number>
    <sample_type>B</sample_type>
    <blank_used>N</blank_used>
    <time_sampled></time_sampled>
    <air_volume_sampled></air_volume_sampled>
    <sample_weight></sample_weight>
    <imis_substance_code>S777</imis_substance_code>
    <substance>Soil</substance>
    <sample_result>0</sample_result>
    <unit_of_measurement>AAAAA</unit_of_measurement>
    <qualifier></qualifier>
  </DATA_RECORD>
  <DATA_RECORD>
    <inspection_number>315516757</inspection_number>
    <establishment_name>MARGUERITE CONCRETE CO.</establishment_name>
    <city>WORCESTER</city>
    <state>MA</state>
    <zip_code>1608</zip_code>
    <sic_code>1771</sic_code>
    <naics_code>238110</naics_code>
    <sampling_number>423259902</sampling_number>
    <office_id>112600</office_id>
    <date_sampled>2011-12-30</date_sampled>
    <date_reported>2011-12-30</date_reported>
    <eight_hour_twa_calc>N</eight_hour_twa_calc>
    <instrument_type>GRAV</instrument_type>
    <lab_number>L13355</lab_number>
    <field_number>9831B</field_number>
    <sample_type>P</sample_type>
    <blank_used>N</blank_used>
    <time_sampled>184</time_sampled>
    <air_volume_sampled>340.4</air_volume_sampled>
    <sample_weight>.06</sample_weight>
    <imis_substance_code>9135</imis_substance_code>
    <substance>Particulates not otherwise regulated (Total Dust)</substance>
    <sample_result>0.176</sample_result>
    <unit_of_measurement>M</unit_of_measurement>
    <qualifier></qualifier>
  </DATA_RECORD></main>'''

root = ET.fromstring(xml)
records = root.findall('.//DATA_RECORD')
with open('out.csv', 'w') as out:
    out.write(','.join(FIELDS) + '\n')
    for record in records:
        values = [record.find(f).text for f in FIELDS]
        out.write(','.join(values) + '\n')

出.csv

lab_number,instrument_type
L13645,TBD
L13355,GRAV

推荐阅读