首页 > 解决方案 > 如何在R中的变量组合中获得分布

问题描述

我有以下DF:

df <- structure(list(ID = c("IR-1", "IR-2", "IR-3", "IR-4", "IR-4", 
"IR-4", "IR-5", "IR-5", "IR-5", "IR-6", "IR-7", "IR-8", "IR-9", 
"IR-10"), Var1 = c("A1", "", "A2", "A1", "A2", "A3", "A1", "A2", 
"A3", "", "A2", "", "A2", ""), Var2 = c("X1,X2,X3", "", "X1,X4", 
"X1,X2,X3", "X1,X2,X3", "X1,X2,X3", "X1,X3", "X1,X3", "X1,X3", 
"", "X1,X2,X3", "X1,X2,X3", "X8", ""), Type = c("New", "Old", 
"New", "New", "New", "New", "New", "New", "New", "New", "New", 
"New", "New", "Old")), class = "data.frame", row.names = c(NA, 
-14L))

> df
ID       Var1         Var2              Type
IR-1     A1           X1,X2,X3          New
IR-2                                    Old
IR-3     A2           X1,X4             New
IR-4     A1           X1,X2,X3          New
IR-4     A3           X1,X2,X3          New
IR-4     A2           X1,X2,X3          New
IR-5     A1           X1,X3             New
IR-5     A2           X1,X3             New
IR-5     A3           X1,X3             New
IR-6                                    New
IR-7     A2           X1,X2,X3          New
IR-8                  X1,X2,X3          New
IR-9     A2           X8                New
IR-10                                   Old

通过使用df,我想了解 cross 的分布Var1Var1结合Var2为一个独特的ID.

在哪里,

所需数据<-

Variables   Excl_Count   %         A1   A2   A3   X1   X2   X3   X4   GT   XN   XP   X8   KP   KL  
Total       10           100.00%   2    4    2    6    4    5    1    0    0    0    1    0    0
Blank_Var1  4             40.00%   0    0    0    1    1    1    0    0    0    0    0    0    0
Blank_Var2  3             30.00%   0    0    0    0    0    0    0    0    0    0    0    0    0
Blank_Both  3             30.00%   0    0    0    0    0    0    0    0    0    0    0    0    0
Blank_New   1             33.33%   0    0    0    0    0    0    0    0    0    0    0    0    0
Blank_Old   2             66.66%   0    0    0    0    0    0    0    0    0    0    0    0    0
Non_Blank   7             70.00%   3    5    2    6    4    5    1    0    0    0    1    0    0
A1          1             16.67%   0    2    2    3    2    3    0    0    0    0    0    0    0
A2          3             50.00%   2    0    2    4    2    3    1    0    0    0    1    0    0
A3          0              0.00%   2    2    0    2    1    2    0    0    0    0    0    0    0
X1          0              0.00%   3    5    2    0    4    5    1    0    0    0    0    0    0
X2          0              0.00%   2    2    2    5    0    4    0    0    0    0    0    0    0
X3          0              0.00%   3    3    2    5    4    0    0    0    0    0    0    0    0
X4          0              0.00%   0    1    0    1    0    0    0    0    0    0    0    0    0
GT          0              0.00%   0    0    0    0    0    0    0    0    0    0    0    0    0
XN          0              0.00%   0    0    0    0    0    0    0    0    0    0    0    0    0
XP          0              0.00%   0    0    0    0    0    0    0    0    0    0    0    0    0
X8          1             16.67%   0    1    0    0    0    0    0    0    0    0    0    0    0
KP          0              0.00%   0    0    0    0    0    0    0    0    0    0    0    0    0
KL          0              0.00%   0    0    0    0    0    0    0    0    0    0    0    0    0
      

标签: rdataframedplyrtidyversetidyr

解决方案


我不确定第 8 行到最后一行、第 2 列和第 3 列的值在您的示例输出中是什么意思。但是,对于其余代码,您可以按以下方式进行。该代码基于您的另一个问题

df <- structure(list(ID = c("IR-1", "IR-2", "IR-3", "IR-4", "IR-4", 
                            "IR-4", "IR-5", "IR-5", "IR-5", "IR-6", "IR-7", "IR-8", "IR-9", 
                            "IR-10"), Var1 = c("A1", "", "A2", "A1", "A2", "A3", "A1", "A2", 
                                               "A3", "", "A2", "", "A2", ""), Var2 = c("X1,X2,X3", "", "X1,X4", 
                                                                                       "X1,X2,X3", "X1,X2,X3", "X1,X2,X3", "X1,X3", "X1,X3", "X1,X3", 
                                                                                       "", "X1,X2,X3", "X1,X2,X3", "X8", ""), Type = c("New", "Old", 
                                                                                                                                       "New", "New", "New", "New", "New", "New", "New", "New", "New", 
                                                                                                                                       "New", "New", "Old")), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                  -14L))
library(tidyverse)
#first dataobject
df1 <- df %>% 
  group_by(Var1) %>%
  mutate(Total = n_distinct(ID),
         Blank_var1 = n_distinct(ID[is.na(Var1) | Var1 == "" | Var1 == "0"]),
         Blank_var2 = n_distinct(ID[is.na(Var2) | Var2 == "" | Var2 == "0"]),
         Blank_Both = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0")]),
         Blank_new = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "New")]),
         Blank_old = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "Old")]),
         non_blank = Total - Blank_Both) %>%
  select(-c(ID, Var2, Type)) %>%
  filter(!(is.na(Var1) | Var1 == "" | Var1 == "0")) %>%
  pivot_longer(-Var1) %>%
  pivot_wider(id_cols = name, names_from = Var1, values_from = "value", values_fn = min) %>%
  ungroup()

#second object
df2 <- df %>% separate(Var2, into = paste0("Var2", 1:3), sep = ",") %>%
  pivot_longer(cols = c(Var21, Var22, Var23), names_to = "name", values_to = "Var2") %>%
  select(-name) %>%
  filter(!(is.na(Var2) | Var2 == "")) %>%
  group_by(Var2) %>%
  mutate(Total = n_distinct(ID),
         Blank_var1 = n_distinct(ID[is.na(Var1) | Var1 == "" | Var1 == "0"]),
         Blank_var2 = n_distinct(ID[is.na(Var2) | Var2 == "" | Var2 == "0"]),
         Blank_Both = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0")]),
         Blank_new = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "New")]),
         Blank_old = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "Old")]),
         non_blank = Total - Blank_Both) %>%
  select(-c(ID, Var1, Type)) %>%
  pivot_longer(-Var2) %>%
  pivot_wider(id_cols = name, names_from = Var2, values_from = "value", values_fn = min)

#third object
df3 <- df %>%
  summarise(Total = n_distinct(ID),
            Blank_var1 = n_distinct(ID[is.na(Var1) | Var1 == "" | Var1 == "0"]),
            Blank_var2 = n_distinct(ID[is.na(Var2) | Var2 == "" | Var2 == "0"]),
            Blank_Both = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0")]),
            Blank_new = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "New")]),
            Blank_old = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "Old")]),
            non_blank = Total - Blank_Both) %>% pivot_longer(cols = 1:7, names_to = "Variable", values_to = "Excl_count") %>%
  mutate(`%` = case_when(Variable == "Total" ~ "100.00%",
                         Variable %in% c("Blank_var1", "Blank_var2", "Blank_Both", "non_blank") ~ paste0(round(Excl_count*100/Excl_count[Variable == "Total"], 2), "%"),
                         Variable == "Blank_new" | Variable == "Blank_old" ~ paste0(round(Excl_count*100/Excl_count[Variable == "Blank_Both"], 2), "%")))


#max values in Var2
n1 <- 1 + max(str_count(df$Var2, ","))
#Tidy data
df_tidy <- df %>% separate(Var2, into = paste0("Var2", 1:n1)) %>%
  pivot_longer(cols = -c(ID, Type), names_to = "Variable") %>%
  mutate(Variable = if_else(str_detect(Variable, "Var2"), "Var2", Variable)) %>%
  filter(!(is.na(value) | value == ""))

#fourth data object
df4 <- merge(df_tidy, df_tidy, by = c("ID", "Type"), all = T) %>%
  filter(! value.x == value.y) %>%
  select(ID, Variable = value.x, value.y) %>%
  group_by(Variable, value.y) %>%
  mutate(val = n_distinct(ID)) %>%
  select(-ID) %>%
  pivot_wider(id_cols = Variable, names_from = value.y, values_from = val, values_fn = min, values_fill = 0) %>%
  mutate(Excl_count = NA,
         `%` = NA)
  


#final
rbind(merge(df3, merge(df1, df2, by.x = "name", by.y = "name", sort = F), 
      by.x = "Variable", by.y = "name", sort = F), df4)

     Variable Excl_count       % A1 A2 A3 X1 X2 X3 X4 X8
1       Total         10 100.00%  3  5  2  6  4  5  1  1
2  Blank_var1          4     40%  0  0  0  1  1  1  0  0
3  Blank_var2          3     30%  0  0  0  0  0  0  0  0
4  Blank_Both          3     30%  0  0  0  0  0  0  0  0
5   Blank_new          1  33.33%  0  0  0  0  0  0  0  0
6   Blank_old          2  66.67%  0  0  0  0  0  0  0  0
7   non_blank          7     70%  3  5  2  6  4  5  1  1
8          A1         NA    <NA>  0  2  2  3  2  3  0  0
9          X1         NA    <NA>  3  4  2  0  4  5  1  0
10         X2         NA    <NA>  2  2  1  4  0  4  0  0
11         X3         NA    <NA>  3  3  2  5  4  0  0  0
12         A2         NA    <NA>  2  0  2  4  2  3  1  1
13         X4         NA    <NA>  0  1  0  1  0  0  0  0
14         A3         NA    <NA>  2  2  0  2  1  2  0  0
15         X8         NA    <NA>  0  1  0  0  0  0  0  0

推荐阅读