首页 > 解决方案 > 以 SDMX 格式循环 URL 并将结果合并到 R 数据帧中

问题描述

我正在尝试下载世界银行的 WITS 数据并将它们转换为 R 数据框。该网站的API(参考第 12-13 页)似乎不允许用户一次调用所有“记者”、“合作伙伴”和“产品”,因此可能需要遍历国家列表(或者“记者”或“合作伙伴”)使用他们的 XML 请求格式:

http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/usa/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG

(需要通过“reporters”(国家/地区缩写)列表遍历“usa”部分)我的目标是遍历一个国家/地区缩写列表,为每次运行生成一个数据框,然后将它们绑定到一个更大的数据框。所以我参考了这篇文章并使用了下面的代码,但它并没有让我更进一步。我在下面发布了我的代码,如果有人可以查看并分享一些关于此的提示,我将不胜感激。

# load required packages
library(RCurl)
library(XML)
devtools::install_github("opensdmx/rsdmx")
library(rsdmx)

# if just for one reporter (usa)
myUrl <- "http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/usa/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG"

dataset <- readSDMX(myUrl)
stats <- as.data.frame(dataset)

dim(stats)
[1] 5142    8

# looks like this
head(stats)
  FREQ REPORTER PARTNER     PRODUCTCODE
1    A      USA     ABW    01-05_Animal
2    A      USA     ABW 06-15_Vegetable
3    A      USA     ABW  16-24_FoodProd

## loop through a list of reporters (countries)

library(rvest)

# teams
reporters <- c("aus", "usa", "ukr")

# init
df <- data.frame()

# loop
for(i in reporters){
    # find url
    myUrl <- paste0("http://wits.worldbank.org/API/V1/SDMX/V21/datasource/tradestats-tariff/reporter/", i,"/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG")
    dataset <- readSDMX(myUrl)
    stats <- as.data.frame(dataset)
    # bind to dataframe
    df <- rbind(df, stats)
}

# view captured data
View(df)

# NOTHING!

标签: rxmlloopssdmx

解决方案


考虑 R 的内置utils::download.file(),然后用XML. 由于您的数据以属性为中心,<Series>并且<Obs>节点中没有元素文本,因此请考虑未记录的xmlAttrsToDataFrame,需要三个冒号限定符:::.

最后,使用类似的 apply 函数,并通过在同一数据帧上迭代调用sapply来避免循环的簿记和在for循环中低效地增长对象。rbind下面甚至包含了下载和 XML 解析,tryCatch以解决潜在的错误,例如ukr.

library(XML)

build_df <- function(i) {
  url <- paste0(
    "http://wits.worldbank.org/API/V1/SDMX/V21/datasource/",
    "tradestats-tariff/reporter/", i,
    "/year/2000/partner/all/product/all/indicator/AHS-WGHTD-AVRG"
  )
  
  df <- NULL
  tryCatch({
    tmp <- tempfile()          # DOWNLOAD XML TO TEMP FILE
    download.file(url, tmp)
    
    # PARSE XML AND EXTRACT ATTRIBUTES TO DATA FRAME
    doc <- XML::xmlParse(tmp)
    df <- cbind(
      XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//Series")),
      XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//Obs"))
    )
    unlink(tmp)               # DELETE TEMP FILE
  }, warning = function(w) print(w)
  , error = function(e) print(e)
  )
  
  return(df)
}

reporters <- c("aus", "usa", "ukr")

# NAMED LIST OF DATA FRAMES
df_list <- sapply(reporters, build_df)

# COMPILE ALL INTO SINGLE DATA FRAME (CALL rbind ONCE)
final_df <- do.call(rbind, unname(df_list))

输出

df_list

head(df_list$aus)
  FREQ REPORTER PARTNER     PRODUCTCODE      INDICATOR TIME_PERIOD        OBS_VALUE DATASOURCE
1    A      AUS     AFG 50-63_TextCloth AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
2    A      AUS     AFG           manuf AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
3    A      AUS     AFG        Textiles AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
4    A      AUS     AFG           Total AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
5    A      AUS     AFG     UNCTAD-SoP3 AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
6    A      AUS     AGO 41-43_HidesSkin AHS-WGHTD-AVRG        2000               20   WITS-TRN

head(df_list$usa)
  FREQ REPORTER PARTNER     PRODUCTCODE      INDICATOR TIME_PERIOD OBS_VALUE DATASOURCE
1    A      USA     ABW    01-05_Animal AHS-WGHTD-AVRG        2000         0   WITS-TRN
2    A      USA     ABW 06-15_Vegetable AHS-WGHTD-AVRG        2000         0   WITS-TRN
3    A      USA     ABW  16-24_FoodProd AHS-WGHTD-AVRG        2000         0   WITS-TRN
4    A      USA     ABW     27-27_Fuels AHS-WGHTD-AVRG        2000         0   WITS-TRN
5    A      USA     ABW 28-38_Chemicals AHS-WGHTD-AVRG        2000         0   WITS-TRN
6    A      USA     ABW 39-40_PlastiRub AHS-WGHTD-AVRG        2000         0   WITS-TRN

head(df_list$ukr)
NULL

final_df

head(final_df)
  FREQ REPORTER PARTNER     PRODUCTCODE      INDICATOR TIME_PERIOD        OBS_VALUE DATASOURCE
1    A      AUS     AFG 50-63_TextCloth AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
2    A      AUS     AFG           manuf AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
3    A      AUS     AFG        Textiles AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
4    A      AUS     AFG           Total AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
5    A      AUS     AFG     UNCTAD-SoP3 AHS-WGHTD-AVRG        2000 0.46377738685431   WITS-TRN
6    A      AUS     AGO 41-43_HidesSkin AHS-WGHTD-AVRG        2000               20   WITS-TRN

tail(final_df)
     FREQ REPORTER PARTNER PRODUCTCODE      INDICATOR TIME_PERIOD          OBS_VALUE DATASOURCE
8966    A      USA     ZWE       Total AHS-WGHTD-AVRG        2000   5.65257483778078   WITS-TRN
8967    A      USA     ZWE      Transp AHS-WGHTD-AVRG        2000  0.249632882835876   WITS-TRN
8968    A      USA     ZWE UNCTAD-SoP1 AHS-WGHTD-AVRG        2000   29.5531507778593   WITS-TRN
8969    A      USA     ZWE UNCTAD-SoP2 AHS-WGHTD-AVRG        2000    2.7243509937362   WITS-TRN
8970    A      USA     ZWE UNCTAD-SoP3 AHS-WGHTD-AVRG        2000   5.12801568054237   WITS-TRN
8971    A      USA     ZWE UNCTAD-SoP4 AHS-WGHTD-AVRG        2000 0.0082396181212962   WITS-TRN

推荐阅读