首页 > 解决方案 > 用于在 R 中的数据框中计算和添加新列的函数

问题描述

我有几个像下面这样、行数各不相同的数据集。我想根据它们计算并添加新列,但这项任务对我来说似乎很复杂,我最终只得到了空列。

# Four example data sets with differing numbers of rows.
# Each frame is built directly from named columns, so no shared temporary
# vectors (ID, class, name, user) are overwritten between definitions and
# no variable called `class` shadows the base function of the same name.

# df1: reference data set; every ID/class pair occurs exactly once.
df1 <- data.frame(
  ID = c("0001", "0002", "0003", "0004", "0008", "0009"),
  class = c("0010", "0011", "0100", "0101", "0110", "0111"),
  name = c("A", "B", "C", "D", "E", "F"),
  user = letters[1:6]
)

# df2: the ID/class pair 0002/0011 occurs twice within this data set.
df2 <- data.frame(
  ID = c("0001", "0002", "0002", "0003", "0004", "0008", "0010"),
  class = c("0010", "0011", "0011", "0100", "0101", "0110", "0112"),
  name = c("A", "B", "B", "C", "D", "E", "G"),
  user = letters[1:7]
)

# df3: two of the pairs carry the name "unknown".
df3 <- data.frame(
  ID = c("0001", "0002", "0003", "0004", "0009"),
  class = c("0010", "0011", "0100", "0101", "0111"),
  name = c("A", "unknown", "C", "unknown", "F"),
  user = letters[1:5]
)

# df4: ID 0001 has a different class ("1010"); names "b" and "unknown" differ
# from the names used for the same pairs in df1.
df4 <- data.frame(
  ID = c("0001", "0002", "0003", "0004", "0008", "0010"),
  class = c("1010", "0011", "0100", "0101", "0110", "0112"),
  name = c("A", "b", "C", "unknown", "E", "G"),
  user = letters[1:6]
)

由 reprex 包 (v2.0.0) 创建于 2021-07-02

我想要这样的输出:

# Desired result: one row per distinct ID/class pair across df1..df4.
# Built from named columns so no global temporaries are left behind.
main <- data.frame(
  ID = c("0001", "0001", "0002", "0003", "0004", "0008", "0009", "0010"),
  class = c("0010", "1010", "0011", "0100", "0101", "0110", "0111", "0112"),
  # Name used for the pair the first time it is seen.
  name = c("A", "A", "B", "C", "D", "E", "F", "G"),
  # Number of data frames in which the pair occurs.
  count_of_ID_class_combination_use = c(3, 1, 4, 4, 4, 3, 2, 2),
  total_df_analyzed = c(4, 4, 4, 4, 4, 4, 4, 4),
  # Pair 0008/0110 occurs in df1, df2 and df4 only (df3 has no ID 0008),
  # which agrees with its count of 3.
  List_of_df_that_use_this = c(
    "df1_df2_df3", "df4", "df1_df2_df3_df4", "df1_df2_df3_df4",
    "df1_df2_df3_df4", "df1_df2_df4", "df1_df3", "df2_df4"
  ),
  # Alternative names seen for the same pair; empty when there are none.
  Other_names_used = c("", "", "unknown_b", "", "unknown", "", "", "")
)

由 reprex 包 (v2.0.0) 创建于 2021-07-02

我想把 df1 与其他数据框进行比较。首先,我想检查同一个数据框中是否有 ID 和 class 的组合被多次使用;这样的重复我会忽略(例如 df2 中的 0002/0011 组合)。然后,根据 ID、class 和 name,我想知道某个特定的 ID 和 class 组合被使用了多少次、分析的数据框总数、使用该组合的数据框列表,以及其他名称(如果同一个 ID 和 class 组合对应多个名称)。

在此先感谢您的帮助。

标签: r、list、dataframe

解决方案


可以用 dplyr 这样实现:

library(dplyr)
library(stringr)

# Summarise how each ID/class pair is used across the data frames df1..df4.
# Prints one row per pair with: the first name seen for it, the number of
# data frames that contain it, the total number of data frames analysed,
# the "_"-joined list of those data frames, and any other names used for it.

# First: Put the df in one list (named df1..df4, looked up by name)
df_list <- mget(paste0("df", 1:4))

# Stack all data frames; `df_id` records which data frame each row came from
df_bind <- dplyr::bind_rows(df_list, .id = "df_id")
df_bind %>%
  # Remove duplicated observations per df
  distinct(ID, class, name, df_id) %>%
  mutate(total_df_analyzed = n_distinct(df_id)) %>%
  group_by(ID, class) %>%
  # Summary stats per ID and class
  mutate(
    # Count data frames, not rows: a pair that carries two different names
    # inside one data frame must still count that data frame only once.
    count_of_ID_class_combination_use = n_distinct(df_id),
    List_of_df_that_use_this = paste(unique(df_id), collapse = "_"),
    # Every name other than the first one seen for the pair. The first name
    # is the one kept in `name` by the distinct() step below. Comparing
    # values literally avoids treating a name as a regular expression.
    Other_names_used = paste(
      setdiff(unique(name), first(name)),
      collapse = "_"
    )
  ) %>%
  ungroup() %>%
  # Keep only one row per ID and class
  distinct(ID, class, .keep_all = TRUE) %>%
  select(
    ID, class, name, count_of_ID_class_combination_use, total_df_analyzed,
    List_of_df_that_use_this, Other_names_used
  ) %>%
  arrange(ID, class) %>%
  as.data.frame()
#>     ID class name count_of_ID_class_combination_use total_df_analyzed
#> 1 0001  0010    A                                 3                 4
#> 2 0001  1010    A                                 1                 4
#> 3 0002  0011    B                                 4                 4
#> 4 0003  0100    C                                 4                 4
#> 5 0004  0101    D                                 4                 4
#> 6 0008  0110    E                                 3                 4
#> 7 0009  0111    F                                 2                 4
#> 8 0010  0112    G                                 2                 4
#>   List_of_df_that_use_this Other_names_used
#> 1              df1_df2_df3                 
#> 2                      df4                 
#> 3          df1_df2_df3_df4        unknown_b
#> 4          df1_df2_df3_df4                 
#> 5          df1_df2_df3_df4          unknown
#> 6              df1_df2_df4                 
#> 7                  df1_df3                 
#> 8                  df2_df4

推荐阅读