r - 如何在R中的变量组合中获得分布
问题描述
我有以下DF:
df <- structure(list(ID = c("IR-1", "IR-2", "IR-3", "IR-4", "IR-4",
"IR-4", "IR-5", "IR-5", "IR-5", "IR-6", "IR-7", "IR-8", "IR-9",
"IR-10"), Var1 = c("A1", "", "A2", "A1", "A2", "A3", "A1", "A2",
"A3", "", "A2", "", "A2", ""), Var2 = c("X1,X2,X3", "", "X1,X4",
"X1,X2,X3", "X1,X2,X3", "X1,X2,X3", "X1,X3", "X1,X3", "X1,X3",
"", "X1,X2,X3", "X1,X2,X3", "X8", ""), Type = c("New", "Old",
"New", "New", "New", "New", "New", "New", "New", "New", "New",
"New", "New", "Old")), class = "data.frame", row.names = c(NA,
-14L))
> df
ID Var1 Var2 Type
IR-1 A1 X1,X2,X3 New
IR-2 Old
IR-3 A2 X1,X4 New
IR-4 A1 X1,X2,X3 New
IR-4 A3 X1,X2,X3 New
IR-4 A2 X1,X2,X3 New
IR-5 A1 X1,X3 New
IR-5 A2 X1,X3 New
IR-5 A3 X1,X3 New
IR-6 New
IR-7 A2 X1,X2,X3 New
IR-8 X1,X2,X3 New
IR-9 A2 X8 New
IR-10 Old
通过使用df
,我想了解 cross 的分布Var1
并Var1
结合Var2
为一个独特的ID
.
在哪里,
- 总计 =
ID
变量命中的唯一计数和水平行计数(即 var1 和 var2),包括那些ID
- Excl_count = 如果一个特定
ID
的只有 1 个值作为其中一个Var1
或Var2
- Blank_Var1 = Null/NA/Blank 或 0的
ID
唯一计数Var1
- Blank_Var2 = Null/NA/Blank 或 0的
ID
唯一计数Var2
- Blank_Both = 唯一计数,
ID
其中Var1
和Var2
均为 Null/NA/Blank 或 0 - Blank_New = 唯一计数,
ID
其中Var1
和Var2
均为 Null/NA/Blank 或 0 具有Type
= 新 - Blank_Old = 唯一计数,
ID
其中Var1
和Var2
均为 Null/NA/Blank 或 0 具有Type
= 旧 - Non_Blank = 唯一的计数,
ID
其中Var1
或者Var2
不是 Null/NA/Blank 或 0 - 休息是检查跨映射。例如,垂直变量(即 A1 到 A3,以及 X1 到 KL)检查其中有多少只命中而没有命中任何其他值。此外,它还会检查
A1
整个变量列表中哪个其他变量的命中
所需数据<-
Variables Excl_Count % A1 A2 A3 X1 X2 X3 X4 GT XN XP X8 KP KL
Total 10 100.00% 2 4 2 6 4 5 1 0 0 0 1 0 0
Blank_Var1 4 40.00% 0 0 0 1 1 1 0 0 0 0 0 0 0
Blank_Var2 3 30.00% 0 0 0 0 0 0 0 0 0 0 0 0 0
Blank_Both 3 30.00% 0 0 0 0 0 0 0 0 0 0 0 0 0
Blank_New 1 33.33% 0 0 0 0 0 0 0 0 0 0 0 0 0
Blank_Old 2 66.66% 0 0 0 0 0 0 0 0 0 0 0 0 0
Non_Blank 7 70.00% 3 5 2 6 4 5 1 0 0 0 1 0 0
A1 1 16.67% 0 2 2 3 2 3 0 0 0 0 0 0 0
A2 3 50.00% 2 0 2 4 2 3 1 0 0 0 1 0 0
A3 0 0.00% 2 2 0 2 1 2 0 0 0 0 0 0 0
X1 0 0.00% 3 5 2 0 4 5 1 0 0 0 0 0 0
X2 0 0.00% 2 2 2 5 0 4 0 0 0 0 0 0 0
X3 0 0.00% 3 3 2 5 4 0 0 0 0 0 0 0 0
X4 0 0.00% 0 1 0 1 0 0 0 0 0 0 0 0 0
GT 0 0.00% 0 0 0 0 0 0 0 0 0 0 0 0 0
XN 0 0.00% 0 0 0 0 0 0 0 0 0 0 0 0 0
XP 0 0.00% 0 0 0 0 0 0 0 0 0 0 0 0 0
X8 1 16.67% 0 1 0 0 0 0 0 0 0 0 0 0 0
KP 0 0.00% 0 0 0 0 0 0 0 0 0 0 0 0 0
KL 0 0.00% 0 0 0 0 0 0 0 0 0 0 0 0 0
解决方案
我不确定第 8 行到最后一行、第 2 列和第 3 列的值在您的示例输出中是什么意思。但是,对于其余代码,您可以按以下方式进行。该代码基于您的另一个问题
df <- structure(list(ID = c("IR-1", "IR-2", "IR-3", "IR-4", "IR-4",
"IR-4", "IR-5", "IR-5", "IR-5", "IR-6", "IR-7", "IR-8", "IR-9",
"IR-10"), Var1 = c("A1", "", "A2", "A1", "A2", "A3", "A1", "A2",
"A3", "", "A2", "", "A2", ""), Var2 = c("X1,X2,X3", "", "X1,X4",
"X1,X2,X3", "X1,X2,X3", "X1,X2,X3", "X1,X3", "X1,X3", "X1,X3",
"", "X1,X2,X3", "X1,X2,X3", "X8", ""), Type = c("New", "Old",
"New", "New", "New", "New", "New", "New", "New", "New", "New",
"New", "New", "Old")), class = "data.frame", row.names = c(NA,
-14L))
library(tidyverse)
#first dataobject
df1 <- df %>%
group_by(Var1) %>%
mutate(Total = n_distinct(ID),
Blank_var1 = n_distinct(ID[is.na(Var1) | Var1 == "" | Var1 == "0"]),
Blank_var2 = n_distinct(ID[is.na(Var2) | Var2 == "" | Var2 == "0"]),
Blank_Both = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0")]),
Blank_new = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "New")]),
Blank_old = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "Old")]),
non_blank = Total - Blank_Both) %>%
select(-c(ID, Var2, Type)) %>%
filter(!(is.na(Var1) | Var1 == "" | Var1 == "0")) %>%
pivot_longer(-Var1) %>%
pivot_wider(id_cols = name, names_from = Var1, values_from = "value", values_fn = min) %>%
ungroup()
#second object
df2 <- df %>% separate(Var2, into = paste0("Var2", 1:3), sep = ",") %>%
pivot_longer(cols = c(Var21, Var22, Var23), names_to = "name", values_to = "Var2") %>%
select(-name) %>%
filter(!(is.na(Var2) | Var2 == "")) %>%
group_by(Var2) %>%
mutate(Total = n_distinct(ID),
Blank_var1 = n_distinct(ID[is.na(Var1) | Var1 == "" | Var1 == "0"]),
Blank_var2 = n_distinct(ID[is.na(Var2) | Var2 == "" | Var2 == "0"]),
Blank_Both = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0")]),
Blank_new = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "New")]),
Blank_old = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "Old")]),
non_blank = Total - Blank_Both) %>%
select(-c(ID, Var1, Type)) %>%
pivot_longer(-Var2) %>%
pivot_wider(id_cols = name, names_from = Var2, values_from = "value", values_fn = min)
#third object
df3 <- df %>%
summarise(Total = n_distinct(ID),
Blank_var1 = n_distinct(ID[is.na(Var1) | Var1 == "" | Var1 == "0"]),
Blank_var2 = n_distinct(ID[is.na(Var2) | Var2 == "" | Var2 == "0"]),
Blank_Both = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0")]),
Blank_new = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "New")]),
Blank_old = n_distinct(ID[(is.na(Var1) | Var1 == "" | Var1 == "0") & (is.na(Var2) | Var2 == "" | Var2 == "0") & (Type == "Old")]),
non_blank = Total - Blank_Both) %>% pivot_longer(cols = 1:7, names_to = "Variable", values_to = "Excl_count") %>%
mutate(`%` = case_when(Variable == "Total" ~ "100.00%",
Variable %in% c("Blank_var1", "Blank_var2", "Blank_Both", "non_blank") ~ paste0(round(Excl_count*100/Excl_count[Variable == "Total"], 2), "%"),
Variable == "Blank_new" | Variable == "Blank_old" ~ paste0(round(Excl_count*100/Excl_count[Variable == "Blank_Both"], 2), "%")))
#max values in Var2
n1 <- 1 + max(str_count(df$Var2, ","))
#Tidy data
df_tidy <- df %>% separate(Var2, into = paste0("Var2", 1:n1)) %>%
pivot_longer(cols = -c(ID, Type), names_to = "Variable") %>%
mutate(Variable = if_else(str_detect(Variable, "Var2"), "Var2", Variable)) %>%
filter(!(is.na(value) | value == ""))
#fourth data object
df4 <- merge(df_tidy, df_tidy, by = c("ID", "Type"), all = T) %>%
filter(! value.x == value.y) %>%
select(ID, Variable = value.x, value.y) %>%
group_by(Variable, value.y) %>%
mutate(val = n_distinct(ID)) %>%
select(-ID) %>%
pivot_wider(id_cols = Variable, names_from = value.y, values_from = val, values_fn = min, values_fill = 0) %>%
mutate(Excl_count = NA,
`%` = NA)
#final
rbind(merge(df3, merge(df1, df2, by.x = "name", by.y = "name", sort = F),
by.x = "Variable", by.y = "name", sort = F), df4)
Variable Excl_count % A1 A2 A3 X1 X2 X3 X4 X8
1 Total 10 100.00% 3 5 2 6 4 5 1 1
2 Blank_var1 4 40% 0 0 0 1 1 1 0 0
3 Blank_var2 3 30% 0 0 0 0 0 0 0 0
4 Blank_Both 3 30% 0 0 0 0 0 0 0 0
5 Blank_new 1 33.33% 0 0 0 0 0 0 0 0
6 Blank_old 2 66.67% 0 0 0 0 0 0 0 0
7 non_blank 7 70% 3 5 2 6 4 5 1 1
8 A1 NA <NA> 0 2 2 3 2 3 0 0
9 X1 NA <NA> 3 4 2 0 4 5 1 0
10 X2 NA <NA> 2 2 1 4 0 4 0 0
11 X3 NA <NA> 3 3 2 5 4 0 0 0
12 A2 NA <NA> 2 0 2 4 2 3 1 1
13 X4 NA <NA> 0 1 0 1 0 0 0 0
14 A3 NA <NA> 2 2 0 2 1 2 0 0
15 X8 NA <NA> 0 1 0 0 0 0 0 0