首页 > 解决方案 > 在 R 数据框中导入包含多个表的 xml 文件

问题描述

我有一个包含更多表的 xml 文件,我想在我的数据框中导入它。

我使用以下代码我的数据名中只有第一个表,其他表在一个列中具有所有值。我不能将所有节点放在单个列中。节点 Instrument_Ratings 的值都在我的数据框中的一列中

有方法吗?

doc <- xmlParse("file.xml")
df <- xmlToDataFrame(doc, stringsAsFactors = FALSE)
<Instrument_Roots xmlns="http://www.moodys.com/fdp" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" file_type="Delta" frequency="24Hour" generation_time="2020-02-18T12:00:00Z" batch_number="000018" batch_date="2020-02-18Z">
<Instrument_Root action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Deal_number indicator="x" xsi:nil="true"/>
<Class_Code indicator="x">834</Class_Code>
<Class_Text indicator="x">Regular Bond/Debenture</Class_Text>
<Class_Short_Description indicator="x">REG</Class_Short_Description>
<Dated_Date indicator="x" xsi:nil="true"/>
<ISO_Currency_Code indicator="x">USD</ISO_Currency_Code>
<Currency_Multiple_Indicator indicator="x">N</Currency_Multiple_Indicator>
<Maturity_Date indicator="x">2021-02-09T00:00:00</Maturity_Date>
<Maturity_Year indicator="x">2021</Maturity_Year>
<Sale_Date indicator="x">2020-02-04T00:00:00</Sale_Date>
<Face_Amount_USD indicator="x">1.0000000000</Face_Amount_USD>
<Credit_Linked_Indicator indicator="x">N</Credit_Linked_Indicator>
<Takedown_Indicator indicator="x">N</Takedown_Indicator>
<Security_Description indicator="x" xsi:nil="true"/>
<Instrument_Type_Code indicator="x">24021</Instrument_Type_Code>
<Instrument_Type_Text indicator="x">INDEX LINKED EURO MTNS</Instrument_Type_Text>
<Private_Placement_Code indicator="x">24922</Private_Placement_Code>
<Private_Placement_Text indicator="x">Not Applicable</Private_Placement_Text>
<Coupon_Type_Code indicator="x">20434</Coupon_Type_Code>
<Coupon_Type_Text indicator="x">Non Interest Bearing</Coupon_Type_Text>
<Coupon_Type_Short_Description indicator="x">NIB</Coupon_Type_Short_Description>
<Coupon_Frequency_Code indicator="x">17</Coupon_Frequency_Code>
<Coupon_Frequency_Text indicator="x">Not Applicable</Coupon_Frequency_Text>
<Coupon_Frequency_Short_Description indicator="x">NA</Coupon_Frequency_Short_Description>
<Coupon_Rate indicator="x" xsi:nil="true"/>
<Instrument_Description indicator="x" xsi:nil="true"/>
<Product_Line_Description indicator="x" xsi:nil="true"/>
<Series_Class_Text indicator="x">Ser. 53156</Series_Class_Text>
<Market_Description indicator="x">EUROMARKET</Market_Description>
<Face_Amount indicator="x">1.0000000000</Face_Amount>
<Structured_Finance_Indicator indicator="x">N</Structured_Finance_Indicator>
<Structured_Finance_Sequence_Number indicator="x" xsi:nil="true"/>
<Instrument_Ratings>
<Instrument_Rating action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Rating_Class_Number indicator="x">46612</Rating_Class_Number>
<Rating_Date indicator="x">2020-02-17T11:13:23</Rating_Date>
<Moodys_Rating_ID indicator="x">831263727</Moodys_Rating_ID>
<Rating_Level indicator="x">I</Rating_Level>
<Rating_Class_Text indicator="x">Senior Unsecured</Rating_Class_Text>
<Security_Class_Code indicator="x">834</Security_Class_Code>
<Security_Class_Text indicator="x">Regular Bond/Debenture</Security_Class_Text>
<Security_Class_Short_Description indicator="x">REG</Security_Class_Short_Description>
<Duration_Code indicator="x">25636</Duration_Code>
<Duration_Text indicator="x">Long-Term Debt Rating</Duration_Text>
<Duration_Short_Description indicator="x">LT</Duration_Short_Description>
<Seniority_Code indicator="x">18744</Seniority_Code>
<Seniority_Text indicator="x">Senior Unsecured</Seniority_Text>
<Seniority_Short_Description indicator="x">SU</Seniority_Short_Description>
<Evaluation_Type_Code indicator="x">25648</Evaluation_Type_Code>
<Evaluation_Type_Text indicator="x">Credit Risk</Evaluation_Type_Text>
<Shadow_Code indicator="x">19139</Shadow_Code>
<Shadow_Text indicator="x">Corporate Finance Regular</Shadow_Text>
<Shadow_Short_Description indicator="x">CFR</Shadow_Short_Description>
<Rating_Subclass_Code indicator="x" xsi:nil="true"/>
<Rating_Subclass_Text indicator="x" xsi:nil="true"/>
<Currency_Capd_Code indicator="x">19141</Currency_Capd_Code>
<Currency_Capd_Text indicator="x">Foreign Currency</Currency_Capd_Text>
<Rating_Text indicator="x">NR</Rating_Text>
<Credit_Grade indicator="x" xsi:nil="true"/>
<Rating_Rank indicator="x">0</Rating_Rank>
<Rating_Direction_Code indicator="x">19102</Rating_Direction_Code>
<Rating_Direction_Text indicator="x">DECISION NOT TO RATE</Rating_Direction_Text>
<Rating_Direction_Short_Description indicator="x">NR</Rating_Direction_Short_Description>
<Rating_Type_Code indicator="x">534</Rating_Type_Code>
<Rating_Type_Text indicator="x">Long-Term Debt Rating</Rating_Type_Text>
<Rating_Type_Short_Description indicator="x">LT</Rating_Type_Short_Description>
<Rating_Enhancement_Level indicator="x">UND</Rating_Enhancement_Level>
<Rating_Local_Date indicator="x">2020-02-17T11:13:23</Rating_Local_Date>
<Rating_Termination_Date indicator="x" xsi:nil="true"/>
<Rating_Termination_Local_Date indicator="x" xsi:nil="true"/>
<Rating_Reason_Code indicator="x">25530</Rating_Reason_Code>
<Rating_Reason_Text indicator="x">DECISION NOT TO RATE</Rating_Reason_Text>
<Rating_Currency_Code indicator="x">30</Rating_Currency_Code>
<Rating_Currency_Text indicator="x">U.S. Dollar</Rating_Currency_Text>
<Rating_Currency_ISO_Code indicator="x">USD</Rating_Currency_ISO_Code>
<Rating_Monitor_Indicator indicator="x">1</Rating_Monitor_Indicator>
<Initial_Rating_Indicator indicator="x">Y</Initial_Rating_Indicator>
<Instrument_Watchlist xsi:nil="true"/>
<Instrument_Rating_Attributes>
<Instrument_Rating_Attribute action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Rating_Attribute_Type_Code indicator="x">5159697</Rating_Attribute_Type_Code>
<Rating_Attribute_Type_Text indicator="x">Rating Office</Rating_Attribute_Type_Text>
<Rating_Class_Number indicator="x">46612</Rating_Class_Number>
<Moodys_Rating_ID indicator="x">831263727</Moodys_Rating_ID>
<Effective_Date indicator="x">2020-02-17T11:13:23</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
<Rating_Attribute_Code indicator="x">5159717</Rating_Attribute_Code>
<Rating_Attribute_Text indicator="x">Toronto - Moody's Canada Inc.</Rating_Attribute_Text>
</Instrument_Rating_Attribute>
</Instrument_Rating_Attributes>
</Instrument_Rating>
</Instrument_Ratings>
<Instrument_Supports xsi:nil="true"/>
<Instrument_Organizations>
<Instrument_Organization action="New">
<Organization_ID indicator="x">600008042</Organization_ID>
<Moodys_Legal_Name indicator="x">ROYAL BANK OF CANADA (LONDON BRANCH)</Moodys_Legal_Name>
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Organization_Role_Code indicator="x">129</Organization_Role_Code>
<Organization_Role_Text indicator="x">Issuer</Organization_Role_Text>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Organization>
<Instrument_Organization action="New">
<Organization_ID indicator="x">600022460</Organization_ID>
<Moodys_Legal_Name indicator="x">RBC EUROPE LIMITED</Moodys_Legal_Name>
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Organization_Role_Code indicator="x">25005</Organization_Role_Code>
<Organization_Role_Text indicator="x">Dealer</Organization_Role_Text>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Organization>
</Instrument_Organizations>
<Instrument_Identifiers>
<Instrument_Identifier action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<ID_Type_Code indicator="x">109</ID_Type_Code>
<ID_Type_Text indicator="x">ISIN</ID_Type_Text>
<ID_Type_Short_Description indicator="x">ISI</ID_Type_Short_Description>
<Instrument_ID_Value indicator="x">XS2097291137</Instrument_ID_Value>
</Instrument_Identifier>
<Instrument_Identifier action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<ID_Type_Code indicator="x">2002</ID_Type_Code>
<ID_Type_Text indicator="x">Bloomberg Global Identifier</ID_Type_Text>
<ID_Type_Short_Description indicator="x">BBG ID</ID_Type_Short_Description>
<Instrument_ID_Value indicator="x">BBG00RHP5JH1</Instrument_ID_Value>
</Instrument_Identifier>
</Instrument_Identifiers>
<Instrument_Attributes xsi:nil="true"/>
<Instrument_Markets>
<Instrument_Market action="New">
<Instrument_ID indicator="x">831260145</Instrument_ID>
<Domain_Number indicator="x">806356849</Domain_Number>
<Domain_Name indicator="x">EUROMARKET</Domain_Name>
<Effective_Date indicator="x">2020-02-03T00:00:00</Effective_Date>
<Termination_Date indicator="x" xsi:nil="true"/>
</Instrument_Market>
</Instrument_Markets>
</Instrument_Root>

标签: rxmldataframetidyversexml2

解决方案


xmlToDataFrame中,指定要提取的节点级别getNodeSet。但是,由于您在 XML 中有一个未声明的命名空间前缀xmlns="http://www.moodys.com/fdp",因此您必须使用命名字符向量为 R 中的 XPath 解析定义一个命名空间前缀。

然后,您可以从那里将所有重复部分提取到不同的数据框中。下面cbind是每个节点集合 ( Instrument_Rating_Attribute, Instrument_Organization, Instrument_Identifiers,Instrument_Markets以及父根信息,每个都具有不同数量的观察结果。

library(XML)

doc <- xmlParse("Input.xml")
nmsp <- c(doc="http://www.moodys.com/fdp")

Instrument_Ratings

### Instrument_Ratings
ratings_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
                    xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Rating", namespaces=nmsp), stringsAsFactors = FALSE),
                    xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Rating_Attribute", namespaces=nmsp), stringsAsFactors = FALSE)
                    )

### CLEAN DUPLICATE NAMES
table(names(ratings_df))[table(names(ratings_df)) > 1]
# Instrument_ID    Moodys_Rating_ID Rating_Class_Number 
#             3                   2                   2 

names(ratings_df)[grep("Instrument_ID|Moodys_Rating_ID|Rating_Class_Number", names(ratings_df))]
# [1] "Instrument_ID"  "Instrument_ID" "Rating_Class_Number" "Moodys_Rating_ID" "Instrument_ID" "Rating_Class_Number" "Moodys_Rating_ID" 
names(ratings_df)[grep("Instrument_ID|Moodys_Rating_ID|Rating_Class_Number", names(ratings_df))] <-
    c("Instrument_ID_1", "Instrument_ID_2", "Rating_Class_Number_1", "Moodys_Rating_ID_1",
      "Instrument_ID_3", "Rating_Class_Number_2", "Moodys_Rating_ID_2")

### REMOVE OVERALPS
ratings_df <- within(ratings_df, {
    rm(Instrument_Ratings,
       Instrument_Organizations,
       Instrument_Identifiers,
       Instrument_Markets,
       Instrument_Rating_Attributes
    )
})

str(ratings_df)
# 'data.frame': 1 obs. of  89 variables:
# $ Instrument_ID_1                   : chr "831260145"
# $ Deal_number                       : chr ""
# $ Class_Code                        : chr "834"
# ...

仪器_组织

orgs_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
                 xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Organization", namespaces=nmsp), stringsAsFactors = FALSE)
)

### CLEAN DUPLICATE NAMES
table(names(orgs_df))[table(names(orgs_df)) > 1]
#Instrument_ID 
#            2

names(orgs_df)[grep("Instrument_ID$", names(orgs_df))] <- c("Instrument_ID_1", "Instrument_ID_2")

### REMOVE OVERALPS
orgs_df <- within(orgs_df, {
  rm(Instrument_Ratings, Instrument_Organizations, Instrument_Identifiers, Instrument_Markets)
})

str(orgs_df)
# 'data.frame': 2 obs. of  42 variables:
# $ Instrument_ID_1                   : chr  "831260145" "831260145"
# $ Deal_number                       : chr  "" ""
# $ Class_Code                        : chr  "834" "834"
# ...

Instrument_Identifiers

ids_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
                xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Identifier", namespaces=nmsp), stringsAsFactors = FALSE)
)

### CLEAN DUPLICATE NAMES
table(names(ids_df))[table(names(ids_df)) > 1]
#Instrument_ID 
#            2

names(ids_df)[grep("Instrument_ID$", names(ids_df))] <- c("Instrument_ID_1", "Instrument_ID_2")

### REMOVE OVERALPS
ids_df <- within(ids_df, {
  rm(Instrument_Ratings, Instrument_Identifiers, Instrument_Markets)
})

str(ids_df)
# 'data.frame': 2 obs. of  41 variables:
# $ Instrument_ID_1                   : chr  "831260145" "831260145"
# $ Deal_number                       : chr  "" ""
# $ Class_Code                        : chr  "834" "834"
# ...

仪器_市场

mkts_df <- cbind(xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Root", namespaces=nmsp), stringsAsFactors = FALSE),
                 xmlToDataFrame(doc, nodes=getNodeSet(doc, "//doc:Instrument_Market", namespaces=nmsp), stringsAsFactors = FALSE)
)

### CLEAN DUPLICATE NAMES
table(names(mkts_df))[table(names(mkts_df)) > 1]
#Instrument_ID 
#            2

names(mkts_df)[grep("Instrument_ID$", names(mkts_df))] <- c("Instrument_ID_1", "Instrument_ID_2")

### REMOVE OVERALPS
mkts_df <- within(mkts_df, {
  rm(Instrument_Ratings, Instrument_Markets)
})

str(mkts_df)
# 'data.frame': 1 obs. of  42 variables:
# $ Instrument_ID_1                   : chr "831260145"
# $ Deal_number                       : chr ""
# $ Class_Code                        : chr "834"

推荐阅读