r - 用于在 R 中的数据框中计算和添加新列的函数
问题描述
我有几个像这样行数各不相同的数据集。但这项任务对我来说似乎很复杂,我最终得到的都是空列。
# Example input: four data frames with the same columns (ID, class, name,
# user) but different numbers of rows.
#
# Columns are passed as named arguments, so no helper vectors are left in
# the workspace and base::class() is not shadowed by a variable.
# stringsAsFactors = FALSE keeps every column character on R < 4.0 too,
# where the default would have produced factors.
df1 <- data.frame(
  ID = c("0001", "0002", "0003", "0004", "0008", "0009"),
  class = c("0010", "0011", "0100", "0101", "0110", "0111"),
  name = c("A", "B", "C", "D", "E", "F"),
  user = letters[1:6],
  stringsAsFactors = FALSE
)

# The pair ID "0002" / class "0011" appears twice in this data frame.
df2 <- data.frame(
  ID = c("0001", "0002", "0002", "0003", "0004", "0008", "0010"),
  class = c("0010", "0011", "0011", "0100", "0101", "0110", "0112"),
  name = c("A", "B", "B", "C", "D", "E", "G"),
  user = letters[1:7],
  stringsAsFactors = FALSE
)

# Two rows carry the placeholder name "unknown".
df3 <- data.frame(
  ID = c("0001", "0002", "0003", "0004", "0009"),
  class = c("0010", "0011", "0100", "0101", "0111"),
  name = c("A", "unknown", "C", "unknown", "F"),
  user = letters[1:5],
  stringsAsFactors = FALSE
)

# ID "0001" is paired with class "1010" here (it is "0010" in df1-df3), and
# ID "0002" uses the lower-case name "b".
df4 <- data.frame(
  ID = c("0001", "0002", "0003", "0004", "0008", "0010"),
  class = c("1010", "0011", "0100", "0101", "0110", "0112"),
  name = c("A", "b", "C", "unknown", "E", "G"),
  user = letters[1:6],
  stringsAsFactors = FALSE
)
由reprex 包于 2021-07-02 创建 (v2.0.0 )
我想要这样的输出:
# Desired result: one row per (ID, class) pair found across df1..df4.
#
# The entry for ID "0008" / class "0110" lists df1_df2_df4: that pair is not
# present in df3, which is consistent with its count of 3.
main <- data.frame(
  ID = c("0001", "0001", "0002", "0003", "0004", "0008", "0009", "0010"),
  class = c("0010", "1010", "0011", "0100", "0101", "0110", "0111", "0112"),
  name = c("A", "A", "B", "C", "D", "E", "F", "G"),
  # Number of data frames in which the (ID, class) pair occurs
  count_of_ID_class_combination_use = c(3, 1, 4, 4, 4, 3, 2, 2),
  # Number of data frames compared; the same value on every row
  total_df_analyzed = rep(4, 8),
  # Names of the data frames containing the pair, joined by "_"
  List_of_df_that_use_this = c(
    "df1_df2_df3", "df4", "df1_df2_df3_df4", "df1_df2_df3_df4",
    "df1_df2_df3_df4", "df1_df2_df4", "df1_df3", "df2_df4"
  ),
  # Alternative names seen for the pair, joined by "_"; "" when there are none
  Other_names_used = c("", "", "unknown_b", "", "unknown", "", "", ""),
  stringsAsFactors = FALSE
)
由reprex 包于 2021-07-02 创建 (v2.0.0 )
我想把 df1 与其他数据框进行比较。首先,我想检查是否有 ID 和 class 的组合被多次使用;这类重复我会忽略(例如 df2 中 0002 和 0011 的组合)。然后,结合 ID、class 和 name,我想知道某个特定的 ID 和 class 组合被使用了多少次、分析的数据框总数、使用该组合的数据框列表,以及其他名称(如果同一个 ID 和 class 组合有多个名称)。
在此先感谢您的帮助。
解决方案
可以用 dplyr 这样实现:
library(dplyr)
library(stringr)

# Collect df1..df4 from the calling environment into a named list, then
# stack them; `.id` stores each list name ("df1", ...) in the df_id column.
df_list <- mget(paste0("df", 1:4))
df_bind <- dplyr::bind_rows(df_list, .id = "df_id")

df_bind %>%
  # Drop rows repeated within the same data frame
  distinct(ID, class, name, df_id) %>%
  mutate(total_df_analyzed = n_distinct(df_id)) %>%
  group_by(ID, class) %>%
  # Summary stats per ID and class
  mutate(
    # Count and list each data frame once, even when it holds the same
    # ID/class pair under more than one name
    count_of_ID_class_combination_use = n_distinct(df_id),
    List_of_df_that_use_this = paste(unique(df_id), collapse = "_"),
    Other_names_used = paste(unique(name), collapse = "_")
  ) %>%
  ungroup() %>%
  # Keep only one row per ID and class (the first one encountered)
  distinct(ID, class, .keep_all = TRUE) %>%
  # The kept row's own name is the first entry of Other_names_used. Remove
  # it as a literal string, so names containing regex metacharacters are
  # handled, then drop the separator that is left at the front.
  mutate(
    Other_names_used = str_remove(Other_names_used, fixed(name)),
    Other_names_used = str_remove(Other_names_used, "^_")
  ) %>%
  select(
    ID, class, name, count_of_ID_class_combination_use, total_df_analyzed,
    List_of_df_that_use_this, Other_names_used
  ) %>%
  arrange(ID, class) %>%
  as.data.frame()
#> ID class name count_of_ID_class_combination_use total_df_analyzed
#> 1 0001 0010 A 3 4
#> 2 0001 1010 A 1 4
#> 3 0002 0011 B 4 4
#> 4 0003 0100 C 4 4
#> 5 0004 0101 D 4 4
#> 6 0008 0110 E 3 4
#> 7 0009 0111 F 2 4
#> 8 0010 0112 G 2 4
#> List_of_df_that_use_this Other_names_used
#> 1 df1_df2_df3
#> 2 df4
#> 3 df1_df2_df3_df4 unknown_b
#> 4 df1_df2_df3_df4
#> 5 df1_df2_df3_df4 unknown
#> 6 df1_df2_df4
#> 7 df1_df3
#> 8 df2_df4
推荐阅读
- user-interface - 如何在 godbolt.org 保存新 URL?
- html - getURL 不适用于一个链接(以前也有效)
- android - 颤振应用程序中的底部导航栏项目图标
- symfony - 如何在 Symfony 的表单构建器中添加 if
- python - 将更新的模型特征传递给逻辑回归并在测试集上获得预测分数
- google-app-maker - Google App Maker:如何更新一个字段的多行?
- python-3.x - 调度动作
- html - HTML 画布:如何创建一个填充有网格的多边形
- javascript - 多个 jQuery 正则表达式测试
- ios - 在 swift 4 中添加/更新自定义对象字典数组