首页 > 解决方案 > R总结折叠的Data.Table

问题描述

我有这样的数据

# Example input: 36 rows with two 0/1 grouping indicators (SCHOOL, GRADE)
# and three 0/1 outcome indicators (CAT, FOX, DOG).
# The grouping columns are spelled in upper case because the grouped
# summaries that follow refer to them as SCHOOL and GRADE; with the
# mixed-case names "School"/"Grade" those statements cannot find them.
data <- data.table(
 "SCHOOL" = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 
              1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0),
 "GRADE"  = c(0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 
              0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),
 "CAT"    = c(1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 
              0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
 "FOX"    = c(1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 
              1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0),
 "DOG"    = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 
              0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1)
)

并希望实现一个新的数据表,例如:

# Skeleton of the requested result: three rows per variable (CAT, FOX, DOG),
# each paired with the SCHOOL/GRADE codes (1,0), (1,1), (0,1).
# MEAN is NA in every row.
dataWANT <- data.frame(
  VARIABLE = rep(c('CAT', 'FOX', 'DOG'), each = 3),
  SCHOOL   = rep(c(1, 1, 0), times = 3),
  GRADE    = rep(c(0, 1, 1), times = 3),
  MEAN     = NA
)

dataWANT 分别按 SCHOOL、GRADE 以及 SCHOOL × GRADE 计算 CAT、FOX、DOG 的平均值,并且只取这些分组变量等于 1 时的结果。

我知道如何逐个变量地做到这一点,但在大数据上这样做并不可取。

# Attach group-wise means to every row: CAT by SCHOOL, FOX by GRADE,
# and DOG by the SCHOOL x GRADE combination.
data[, CAT1 := mean(CAT), by = .(SCHOOL)]
data[, FOX1 := mean(FOX), by = .(GRADE)]
data[, DOG1 := mean(DOG), by = .(SCHOOL, GRADE)]

# For each variable, take the distinct mean found in the rows whose
# grouping indicator(s) equal 1 and store it as a new column.
data$CAT2 <- data[SCHOOL == 1, unique(CAT1)]
data$FOX2 <- data[GRADE == 1, unique(FOX1)]
data$DOG2 <- data[SCHOOL == 1 & GRADE == 1, unique(DOG1)]

请只使用这个:

# Input table: 36 rows holding the 0/1 grouping indicators SCHOOL and GRADE
# and the 0/1 outcome indicators CAT, FOX and DOG.
data <- data.table(
  SCHOOL = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
             1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0),
  GRADE  = c(0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
             0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),
  CAT    = c(1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
             0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
  FOX    = c(1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
             1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0),
  DOG    = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
             0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1)
)
    
    
# For each outcome column add three per-row group means:
#   <VAR>1 = mean within SCHOOL, <VAR>2 = mean within GRADE,
#   <VAR>3 = mean within the SCHOOL x GRADE combination.

# CAT
data[, CAT1 := mean(CAT), by = "SCHOOL"]
data[, CAT2 := mean(CAT), by = "GRADE"]
data[, CAT3 := mean(CAT), by = c("SCHOOL", "GRADE")]

# FOX
data[, FOX1 := mean(FOX), by = "SCHOOL"]
data[, FOX2 := mean(FOX), by = "GRADE"]
data[, FOX3 := mean(FOX), by = c("SCHOOL", "GRADE")]

# DOG
data[, DOG1 := mean(DOG), by = "SCHOOL"]
data[, DOG2 := mean(DOG), by = "GRADE"]
data[, DOG3 := mean(DOG), by = c("SCHOOL", "GRADE")]


# Expected result: one row per (VARIABLE, TYPE) with the mean of that
# variable over the rows where the grouping indicator(s) equal 1.
# TYPE 1 = grouped by SCHOOL, 2 = grouped by GRADE, 3 = grouped by both.
# The FOX / TYPE 2 entry is 10/18 (about 0.5556) for the input table
# defined above; 0.611 is the DOG / TYPE 2 value.
dataWANT <- data.frame(
  "VARIABLE" = c('CAT', 'CAT', 'CAT', 'FOX', 'FOX', 'FOX', 'DOG', 'DOG', 'DOG'),
  "TYPE"     = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
  "MEAN"     = c(0.48, 0.44, 0.428, 0.6, 0.5556, 0.6428, 0.52, 0.61, 0.6428)
)

其中:
当 MEAN 按 SCHOOL 估计时,TYPE 等于 1;
当 MEAN 按 GRADE 估计时,TYPE 等于 2;
当 MEAN 按 SCHOOL 和 GRADE 共同估计时,TYPE 等于 3。

标签: r data.table summary

解决方案


我们可以先对数据集进行 melt,然后按每种分组求出 MEAN 并放入一个 list,最后用 rbindlist 把它们合并(做法与另一篇文章中相同)

# Build a TYPE / VARIABLE / MEAN summary: for each grouping (SCHOOL, GRADE,
# SCHOOL + GRADE) compute the mean of CAT, FOX and DOG, keep only the group
# whose grouping indicator(s) are all non-zero, and stack the results.
library(data.table)
# Outcome columns to summarise.
cols <- c('CAT', 'FOX', 'DOG')
# Reshape to long format so the outcome columns become rows; the code below
# reads the outcome name from `variable` and the measurement from `value`.
data1 <- melt(data, measure.vars = cols)
# The three groupings; their position in this list becomes TYPE (1, 2, 3).
list_cols <- list('SCHOOL', 'GRADE', c('SCHOOL', 'GRADE'))
# One summary table per grouping: mean of `value` for every combination of
# the grouping column(s) and `variable`.
lst1 <- lapply(list_cols, function(x)  
       data1[, .(MEAN = mean(value, na.rm = TRUE)), c(x, 'variable')])
# Filter each summary and bind them; idcol = 'TYPE' records which list
# element each row came from. The final step orders rows by VARIABLE.
rbindlist(lapply(lst1, function(x)  {
     # Grouping columns are whatever is left besides 'variable' and 'MEAN'.
     nm1 <- setdiff(names(x), c('variable', 'MEAN'))
     # Keep rows where every grouping column converts to TRUE under
     # as.logical (i.e. is non-zero), then keep only the variable name
     # (renamed VARIABLE) and MEAN.
     x[Reduce(`&`, lapply(mget(nm1), as.logical)),
     .(VARIABLE = variable, MEAN)]}), idcol = 'TYPE')[order(VARIABLE)]
#   TYPE VARIABLE      MEAN
#1:    1      CAT 0.4800000
#2:    2      CAT 0.4444444
#3:    3      CAT 0.4285714
#4:    1      FOX 0.6000000
#5:    2      FOX 0.5555556
#6:    3      FOX 0.6428571
#7:    1      DOG 0.5200000
#8:    2      DOG 0.6111111
#9:    3      DOG 0.6428571

推荐阅读