首页 > 解决方案 > Python中的XML到数据库

问题描述

我有数千个 XML 文件

它有许多标签和许多属性。

我想把数据放在 SQL Server 表中。

表的结构将是这样的

DocumentPath varchar(1000)
Tag          varchar(1000)
ID           varchar(1000)
Attribute    varchar(1000)
Value        varchar(1000)

XML 文件看起来像这样

<?xml version="1.0" encoding="UTF-8"?>
<xmi:XMI xmlns:xmi="http://www.omg.org/XMI" xmlns:util="http:///org/" xmi:version="2.0">
    <cas:NULL xmi:id="0"/>
    <tcas:DocumentAnnotation xmi:id="8" sofa="1" begin="0" end="5769" language="x-unspecified"/>
    <structured:DocumentID xmi:id="13" documentID="5"/>
    <structured:DocumentIdPrefix xmi:id="15" documentIdPrefix=""/>
    <structured:DocumentPath xmi:id="35" documentPath="J:\T9.xmi"/>
    <textspan:Segment xmi:id="37" sofa="1" begin="0" end="5769" id="SIMPLE_SEGMENT" preferredText="SIMPLE_SEGMENT"/>
    <textspan:Sentence xmi:id="44" sofa="1" begin="0" end="15" sentenceNumber="0"/>
    <textspan:Sentence xmi:id="50" sofa="1" begin="17" end="32" sentenceNumber="1"/>
    <textspan:Sentence xmi:id="56" sofa="1" begin="37" end="52" sentenceNumber="2"/>
    <syntax:SymbolToken xmi:id="2242" sofa="1" begin="18" end="19" tokenNumber="4"/>
    <syntax:SymbolToken xmi:id="2250" sofa="1" begin="19" end="20" tokenNumber="5"/>
    <syntax:SymbolToken xmi:id="2301" sofa="1" begin="29" end="30" tokenNumber="11"/>
    <syntax:SymbolToken xmi:id="2309" sofa="1" begin="30" end="31" tokenNumber="12"/>
    <syntax:NumToken xmi:id="2258" sofa="1" begin="20" end="24" tokenNumber="6" numType="1"/>
    <syntax:NumToken xmi:id="2275" sofa="1" begin="25" end="26" tokenNumber="8" numType="1"/>
    <syntax:NumToken xmi:id="2292" sofa="1" begin="27" end="29" tokenNumber="10" numType="1"/>
    <syntax:NumToken xmi:id="2381" sofa="1" begin="57" end="61" tokenNumber="20" numType="1"/>
    <textsem:MedicationMention xmi:id="18523" sofa="1" begin="534" end="540" id="0" ontologyConceptArr="18510" typeID="1" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:MedicationMention xmi:id="16831" sofa="1" begin="543" end="547" id="0" ontologyConceptArr="16818" typeID="1" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:MedicationMention xmi:id="16788" sofa="1" begin="558" end="569" id="0" ontologyConceptArr="16753 16773 16763" typeID="1" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:ProcedureMention xmi:id="22518" sofa="1" begin="328" end="343" id="0" ontologyConceptArr="22505" typeID="5" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:ProcedureMention xmi:id="22682" sofa="1" begin="478" end="490" id="0" ontologyConceptArr="22669" typeID="5" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:ProcedureMention xmi:id="22228" sofa="1" begin="495" end="510" id="0" ontologyConceptArr="22215" typeID="5" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:ProcedureMention xmi:id="21938" sofa="1" begin="794" end="809" id="0" ontologyConceptArr="21925" typeID="5" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:ProcedureMention xmi:id="22868" sofa="1" begin="1057" end="1072" id="0" ontologyConceptArr="22855" typeID="5" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:AnatomicalSiteMention xmi:id="24333" sofa="1" begin="786" end="793" id="0" ontologyConceptArr="24320" typeID="6" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:AnatomicalSiteMention xmi:id="25103" sofa="1" begin="842" end="857" id="0" ontologyConceptArr="25090" typeID="6" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:AnatomicalSiteMention xmi:id="23654" sofa="1" begin="842" end="850" id="0" ontologyConceptArr="23641" typeID="6" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:EventMention xmi:id="25137" sofa="1" begin="117" end="124" id="0" ontologyConceptArr="25124" typeID="1003" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:EventMention xmi:id="25201" sofa="1" begin="3448" end="3458" id="0" ontologyConceptArr="25188" typeID="1003" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <textsem:EventMention xmi:id="25169" sofa="1" begin="3796" end="3803" id="0" ontologyConceptArr="25156" typeID="1003" discoveryTechnique="1" confidence="0.0" polarity="0" uncertainty="0" conditional="false" generic="false" historyOf="0"/>
    <cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text"/>
    <structured:SourceData xmi:id="23" noteTypeCode="ClinicalNote" sourceRevisionNbr="0" sourceRevisionDate="2020-03-02 10:48:30"/>
    <refsem:UmlsConcept xmi:id="18510" codingScheme="SNOMEDCT_US" code="64197008" score="0.0" disambiguated="false" cui="C0037366" tui="T131" preferredText="Smoke"/>
    <refsem:UmlsConcept xmi:id="16818" codingScheme="RXNORM" code="746839" score="0.0" disambiguated="false" cui="C1999262" tui="T122" preferredText="Pack"/>
    <refsem:UmlsConcept xmi:id="16753" codingScheme="SNOMEDCT_US" code="410942007" score="0.0" disambiguated="false" cui="C0013227" tui="T121" preferredText="Pharmaceutical Preparations"/>
    <cas:View sofa="1" members="8 13 15 17 35"/>
</xmi:XMI>

structured:DocumentPath标签的值和documentPath attibuteDocumentPath在表中

Tag在 XML 标记的每一行中,例如textspan:Sentence

IDxmi:id价值

属性是每个属性名称,值是属性值

因此这条线

<textspan:Sentence xmi:id="44" sofa="1" begin="0" end="15" sentenceNumber="0"/>

将像这样在表格中

DocumentPath  Tag                 ID    Attribute         Value        
J:\T9.xmi     textspan:Sentence   44    sofa              1
J:\T9.xmi     textspan:Sentence   44    begin             0
J:\T9.xmi     textspan:Sentence   44    end               15
J:\T9.xmi     textspan:Sentence   44    sentenceNumber    0

我发现这个网站展示了如何读取一个简单的 XML 文件

在此处输入图像描述

但我的情况与每行中的属性数量不同是动态的

标签: pythonsql-serverxmldatabase

解决方案


尝试pip intall untangle安装untangle. 一个示例使用您的输入:

from untangle import parse
your_xml = """.. omit.."""
your_obj = parse(your_xml)
your_obj.xmi_XMI.textspan_Segment._attributes

{'xmi:id': '37',
 'sofa': '1',
 'begin': '0',
 'end': '5769',
 'id': 'SIMPLE_SEGMENT',
 'preferredText': 'SIMPLE_SEGMENT'}

如您所见,所有数据都可以像对象一样访问。


推荐阅读