r - 来自 R API 调用的重复行
问题描述
对于 NPPES API 的每个请求,我都会返回两行,我很难理解为什么。
我可以使用 dplyr 选择不同的,但我宁愿更好地理解为什么我得到重复的行而不是使用解决方法。您的帮助将不胜感激。
我的代码如下所示:
pacman::p_load(tidyverse,httr,jsonlite,purrr)
path <- "https://npiregistry.cms.hhs.gov/api/?"
# CREATE A FUNCTION TO QUERY THE NPPES NPI REGISTRY
getNPI <- function(object) {
request <- httr::GET(url = path,
query = list(version = "2.0",
number = object))
df <- content(request, as = "text", encoding = "UTF-8") %>%
jsonlite::fromJSON(., flatten = TRUE) %>%
data.frame() %>%
tidyr::unnest(c(results.addresses,results.taxonomies), names_repair = "unique")
df_col_names <- names(df)
cols_to_add <- setdiff(c("result_count",
"results.enumeration_type",
"results.number",
"results.last_updated_epoch",
"results.created_epoch",
"results.other_names",
"country_code",
"country_name",
"address_purpose",
"address_type",
"address_1",
"address_2",
"city",
"state",
"postal_code",
"telephone_number",
"code",
"desc",
"primary",
"state1",
"license",
"results.identifiers",
"results.basic.first_name",
"results.basic.last_name",
"results.basic.middle_name",
"results.basic.credential",
"results.basic.sole_proprietor",
"results.basic.gender",
"results.basic.enumeration_date",
"results.basic.last_updated",
"results.basic.status",
"results.basic.name"), df_col_names)
if (length(cols_to_add) > 0) {
for(i in cols_to_add){
df[,i] <- "UNKNOWN"
}
}
# df_col_names <- names(df$results.taxonomies[[1]])
# cols_to_add <- setdiff(c("desc","state"), df_col_names)
# if (length(cols_to_add) > 0) {
# for(i in cols_to_add){
# df$results.taxonomies[[1]][,i] <- "UNKNOWN"
# }
# }
df %>%
select(results.number,
results.basic.name,
results.enumeration_type,
results.basic.gender,
results.basic.credential,
desc) %>%
rename(Provider_NPI = results.number,
Provider_Name = results.basic.name,
Provider_Gender = results.basic.gender,
Provider_Credentials = results.basic.credential,
Provider_Taxonomy = desc) %>%
mutate(
Provider_Type = case_when(
results.enumeration_type == "NPI-1" ~ 'Individual Provider',
results.enumeration_type == "NPI-2" ~ 'Organizational Provider'
)
) %>%
select(-results.enumeration_type)
}
providerIDs <- c(
#'1477765634',
'1376815795'
)
test <- lapply(providerIDs, getNPI)
final_df <- do.call("rbind",test)
解决方案
如果我们一步一步地这样做,“data.frame”的“results.taxonomies”、“results.addresses”分别有 1 行和 2 行
out <- content(request, as = "text", encoding = "UTF-8") %>%
jsonlite::fromJSON(., flatten = TRUE) %>%
data.frame()
str(out)
#...
# $ results.addresses :List of 1
# ..$ :'data.frame': 2 obs. of 10 variables: # 2 row
# .. ..$ country_code : chr "US" "US"
# .. ..$ country_name : chr "United States" "United States"
# .. ..$ address_purpose : chr "LOCATION" "MAILING"
# .. ..$ address_type : chr "DOM" "DOM"
# .. ..$ address_1 : chr "1122 BROADWAY" "1122 BROADWAY"
# .. ..$ address_2 : chr "" ""
# .. ..$ city : chr "WOODMERE" "WOODMERE"
# .. ..$ state : chr "NY" "NY"
# .. ..$ postal_code : chr "115981242" "115981242"
# .. ..$ telephone_number: chr "516-295-3838" "516-295-3838"
# $ results.taxonomies :List of 1
# ..$ :'data.frame': 1 obs. of 5 variables: # one row
# .. ..$ code : chr "363A00000X"
# .. ..$ desc : chr "Physician Assistant"
# .. ..$ primary: logi TRUE
# .. ..$ state : chr "NY"
# .. ..$ license: chr "013595-1"
# ...
在unnest
两个数据集上,它产生两行
out1 <- out %>%
tidyr::unnest(c(results.addresses,results.taxonomies), names_repair = "unique")
dim(out1)
#[1] 2 32
但是,列中存在一些差异,例如
out1 %>%
select(address_purpose)
# A tibble: 2 x 1
# address_purpose
# <chr>
#1 LOCATION
#2 MAILING
推荐阅读
- css - 使用 Tailwind CSS 进行样式设置
- node.js - AWS SAM 部署
- css - 使用新颜色的背景的 CSS 颜色部分
- asp.net-core - 使用 python 请求模块通过自签名 https 证书访问 asp.net 核心 localhost 页面
- r - 将数据框中的所有列配对到 R 中的列表中
- javascript - JS - 从不同文件导入 VAR 后功能不起作用
- c - 用递归的字符串创建一个带有字符的链表?
- javascript - 如果字段有信息,如何显示跨度,否则 React JSX 不显示?
- gitlab-ci - 除非仅更改某个文件夹,否则如何始终运行 Gitlab CI/CD 阶段?
- python - 没有文件更改数据的合并提交无法在数据框中显示