# Example input: a 16 x 6 matrix of counts (rows = labels, columns "1".."6").
# All 96 cell values are spelled out, one line per column, so matrix() never
# has to recycle a shorter vector. The values agree with the pairwise column
# sums printed further down in this file.
mat <- matrix(
  c(1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0,  # column "1"
    2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,  # column "2"
    0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # column "3"
    0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # column "4"
    0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,  # column "5"
    1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0), # column "6"
  nrow = 16, ncol = 6
)
dimnames(mat) <- list(
  c("a", "c", "f", "h", "i", "j", "l", "m",
    "p", "q", "s", "t", "u", "v", "x", "z"),
  c("1", "2", "3", "4", "5", "6")
)

我想对列进行分组或分箱,然后为每个组聚合数据。对大小为 x 的 bin 重复采样 n 次。对于 x+1 的 bin 大小,将重复此过程。

对于第一次迭代,随机选取两列组成一个 bin。我希望进行无放回抽样,使同一对列的组合不会被抽到两次(不过,同一列如果与不同的列配对,则可以被多次使用)。聚合的定义是计算被合并各列的行总和。聚合结果将作为新列添加到该 bin 大小对应的结果矩阵中。结果矩阵的列数将限制为随机抽取的 bin 数。

bin 大小会不断增大。在下一次迭代中,bin 大小增加到 3,即聚合 3 个随机选择的列。聚合后的数据将放入另一个结果矩阵中。这个过程会一直持续到 bin 大小达到数据(矩阵)的总列数为止,此时已无法再进行重采样。所有结果矩阵将被放入一个矩阵列表中。

下面是针对上述矩阵、前两种 bin 大小的预期结果 resultList。

# Expected results for the first two bin sizes. Every column of a result
# matrix holds the row sums of the input columns named in its header, and
# every result matrix lists all of its cells (16 rows x number of bins).

# Bin size = 2
# The randomly sampled columns are columns 1&2, 2&3, 3&4, 4&5, 5&6.
mat1 <- matrix(
  c(3, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0,  # "1_2"
    2, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,  # "2_3"
    0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # "3_4"
    0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,  # "4_5"
    1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 2, 2, 1, 1, 0, 1), # "5_6"
  nrow = 16
)
dimnames(mat1) <- list(
  c("a", "c", "f", "h", "i", "j", "l", "m",
    "p", "q", "s", "t", "u", "v", "x", "z"),
  c("1_2", "2_3", "3_4", "4_5", "5_6")
)

# Bin size = 3
# The randomly selected columns to be joined are columns 1,2&3,
# 2,3&4, 3,4&5, 4,5&6.
mat2 <- matrix(
  c(3, 0, 1, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0,  # "1_2_3"
    2, 1, 1, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,  # "2_3_4"
    0, 1, 1, 1, 2, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,  # "3_4_5"
    1, 2, 0, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 1, 0, 1), # "4_5_6"
  nrow = 16
)
dimnames(mat2) <- list(
  c("a", "c", "f", "h", "i", "j", "l", "m",
    "p", "q", "s", "t", "u", "v", "x", "z"),
  c("1_2_3", "2_3_4", "3_4_5", "4_5_6")
)

# One result matrix per bin size, in increasing order of bin size.
resultList <- list(mat1, mat2)

我在这里发布了一个关于替代分箱技术的类似问题:Bin columns and aggregate data via random sample with replacement for iteratively large bin size


# Sliding-window attempt: for every offset j (window width j + 1, i.e. bins of
# 2 .. ncol(mat) adjacent columns), visit the possible start columns in random
# order and sum the rows of each window.
# Result: a list with one entry per window width, each entry being a list of
# row-sum vectors (one per sampled window).
lapply(seq_len(ncol(mat) - 1), function(j) {
  # No `size` is given, so every valid start column is drawn exactly once;
  # pass `size = k` to sample() to keep only k windows per width.
  starts <- sample(ncol(mat) - j)
  lapply(starts, function(i) {
    # The window runs forward from column i to column i + j.
    rowSums(mat[, i:(i + j), drop = FALSE])
  })
})

# Column names of the input matrix; combinations are built from these names.
all_cols <- colnames(mat)

# For every bin size j in 2 .. ncol(mat), build one matrix whose columns are
# the row sums of every possible combination of j columns of `mat`.
# Each result column is named after the combined columns, joined by "_".
total_out <- lapply(seq_len(ncol(mat))[-1], function(j) {
  # All combinations of j column names, each kept as its own character vector
  # so the sums and the labels are derived from the same combination.
  combos <- combn(all_cols, j, simplify = FALSE)
  # One column of row sums per combination; drop = FALSE keeps the selection a
  # matrix so rowSums() always receives two dimensions.
  new_mat <- vapply(
    combos,
    function(x) rowSums(mat[, x, drop = FALSE]),
    numeric(nrow(mat))
  )
  # Label each column with the names of the columns it aggregates.
  colnames(new_mat) <- vapply(combos, paste0, character(1), collapse = "_")
  # Return the new matrix
  new_mat
})


#  1_2 1_3 1_4 1_5 1_6 2_3 2_4 2_5 2_6 3_4 3_5 3_6 4_5 4_6 5_6
#a   3   1   1   1   2   2   2   2   3   0   0   1   0   1   1
#c   0   0   1   0   1   0   1   0   1   1   0   1   1   2   1
#f   0   1   0   0   0   1   0   0   0   1   1   1   0   0   0
#h   0   1   0   0   0   1   0   0   0   1   1   1   0   0   0
#i   1   1   0   1   0   2   1   2   1   1   2   1   1   0   1
#j   0   0   1   0   0   0   1   0   0   1   0   0   1   1   0
#l   1   1   1   1   1   0   0   0   0   0   0   0   0   0   0
#m   0   0   0   1   0   0   0   1   0   0   1   0   1   0   1
#p   0   0   0   0   1   0   0   0   1   0   0   1   0   1   1
#q   0   0   0   1   0   0   0   1   0   0   1   0   1   0   1
#s   0   0   0   1   1   0   0   1   1   0   1   1   1   1   2
#t   0   0   0   0   2   0   0   0   2   0   0   2   0   2   2
#u   0   0   0   0   1   0   0   0   1   0   0   1   0   1   1
#v   0   0   0   1   0   0   0   1   0   0   1   0   1   0   1
#x   3   2   2   2   2   1   1   1   1   0   0   0   0   0   0
#z   0   0   0   1   0   0   0   1   0   0   1   0   1   0   1
#  1_2_3_4_5_6
#a           4
#c           2
#f           1
#h           1
#i           3
#j           1
#l           1
#m           1
#p           1
#q           1
#s           2
#t           2
#u           1
#v           1
#x           3
#z           1

请注意,total_out 中共有 5 个(即 ncol - 1 个)矩阵,各矩阵的列数如下:

# Number of result matrices (one per bin size).
length(total_out)
#[1] 5

# Number of columns (i.e. column combinations) in each result matrix.
vapply(total_out, ncol, integer(1))
#[1] 15 20 15  6  1


# Drop the last matrix: it is the single combination of all columns, so there
# is nothing left to sample from.
total_out <- total_out[-length(total_out)]

# From each remaining matrix keep a random half (rounded up) of its columns.
# sample() draws without replacement, so no combination is selected twice.
lapply(total_out, function(x) {
  nc <- ncol(x)
  # drop = FALSE keeps the result a matrix even if only one column is drawn.
  x[, sample(nc, ceiling(nc / 2)), drop = FALSE]
})
