首页 > 解决方案 > 基于使用函数的某些计算在数据框中创建附加列时出错

问题描述

我有一个数据框'd3'(其中 dput 如下):

# dput() output of the example data frame d3: 10 rows, columns x1..x10
# (rows 6-10 repeat rows 1-5). Prefix with `d3 <- ` to recreate the object.
structure(list(x1 = c(12.800454545, 17.71, 5.805, 13.111875, 14.121428571, 12.800454545, 17.71, 5.805, 13.111875, 14.121428571), 
x2 = c(281.61, 230.23, 11.61, 209.79, 296.55, 281.61, 230.23, 11.61, 209.79, 296.55), 
x3 = c(19.41, 13.91, 0, 2.37, 23.49, 19.41, 13.91, 0, 2.37, 23.49), 
x4 = c(65L, 62L, 3L, 41L, 45L, 65L, 62L, 3L, 41L, 45L), 
x5 = c(0.571428571, 1.857142857, 21.14285714, 2.571428571, 1.428571429, 0.571428571, 1.857142857, 21.14285714, 2.571428571, 1.428571429), 
x6 = c(52L, 40L, 3L, 22L, 33L, 52L, 40L, 3L, 22L, 33L), 
x7 = c(44.53, 15.38, 5.97, 4.97, 13.94, 44.53, 15.38, 5.97, 4.97, 13.94), 
x8 = c(65L, 53L, 3L, 41L, 45L, 65L, 53L, 3L, 41L, 45L), 
x9 = c(6L, 4L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 1L), 
x10 = c(46.43, 17.52, 0, 11.73, 0, 46.43, 17.52, 0, 11.73, 0)), 
row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), class = "data.frame")

我想为 d3 的每一列生成 5 个虚拟变量(dummy)列,并把它们添加到 df_dummy(d3 的副本)中,这样总共应该有 10 + (5 * 10) = 60 列。

我尝试如下:

library(Hmisc)
# Attempt to append dummy (indicator) columns for every column of d3 to
# df_dummy, which starts out as a copy of d3.
df_dummy <- d3

for (i in 1:length(d3)){
  # Reset the per-iteration temporaries.
  aa <- NULL
  bb <- NULL

  # Bin column i into g = 5 quantile groups with Hmisc::cut2(), then convert
  # the resulting factor to its plain integer group codes.
  aa <- as.integer(cut2(d3[,i], g=5))

  # Create dummy variables
  # NOTE: aa is a numeric vector here, not a factor, so model.matrix() treats
  # it as a single continuous predictor and returns ONE column named "aa"
  # instead of one 0/1 column per group. This is why the result only gains
  # a single extra column per variable.
  bb <- model.matrix(~ aa + 0, data=d3)
  # Clean column names: replace "aa" with "<column name>_D<i>". Note that i is
  # the loop index (the column position in d3), not the group number.
  colnames(bb) <- gsub("aa", paste0(names(d3[i]),"_D",i), colnames(bb))     #clean column names
  bb <- as.data.frame(bb)   # convert matrix to dataframe

  # add dummy columns to the original static dataset
  df_dummy <- cbind(df_dummy, bb)
  #dim(df_static_dummy)
  # Drop the temporaries before the next iteration.
  rm(aa)
  rm(bb)
}

它返回给我的 df_dummy 包含以下列:

x1  x2  x3  x4  x5  x6  x7  x8  x9  x10 x1_D1   x2_D2   x3_D3   x4_D4   x5_D5   x6_D6   x7_D7   x8_D8   x9_D9   x10_D10

而不是我所期望的:

x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x1_D1, x1_D2, x1_D3.... x1_D10, x2_D1, x2_D2, x2_D3.... x2_D10... so forth

标签: r dataframe dplyr dummy-variable

解决方案


试试下面的代码。如果尚未安装 fastDummies 和 Hmisc 包,请先运行 install.packages(c("fastDummies", "Hmisc")) 进行安装。

library(fastDummies)
library(Hmisc)

# Build fdf: d3 followed by one block of 0/1 dummy columns per column of d3.
#
# Each column is binned into (at most) 5 quantile groups with Hmisc::cut2()
# and the integer group code ("rank") is expanded into indicator columns by
# fastDummies::dummy_cols(). The new columns are named
# "<source column>_rank_<group>", e.g. "x1_rank_2".
#
# A column with fewer than 5 distinct values produces fewer than 5 groups and
# therefore fewer than 5 dummy columns.
rank_dummies <- lapply(seq_along(d3), function(i) {
  # Work on a one-column helper frame so that d3 itself is never modified and
  # no positional offset into the result is needed.
  binned <- data.frame(rank = as.integer(cut2(d3[[i]], g = 5)))
  dummies <- dummy_cols(binned, select_columns = "rank")

  # dummy_cols() keeps the source column; only the indicators are wanted.
  dummies[["rank"]] <- NULL

  # Prefix with the real column name rather than assuming it is "x<i>".
  names(dummies) <- paste0(names(d3)[i], "_", names(dummies))
  dummies
})

# Bind everything in a single call instead of growing fdf inside a loop.
fdf <- do.call(cbind, c(list(d3), rank_dummies))
rm(rank_dummies)

根据您的数据,我生成了以下 57 列(不是 60 列:x9 和 x10 分别只有 3 个和 4 个不同的取值,cut2 无法把它们分成 5 组):

names(fdf)
 [1] "x1"         "x2"         "x3"         "x4"         "x5"         "x6"         "x7"         "x8"         "x9"        
[10] "x10"        "x1_rank_2"  "x1_rank_5"  "x1_rank_1"  "x1_rank_3"  "x1_rank_4"  "x2_rank_4"  "x2_rank_3"  "x2_rank_1" 
[19] "x2_rank_2"  "x2_rank_5"  "x3_rank_4"  "x3_rank_3"  "x3_rank_1"  "x3_rank_2"  "x3_rank_5"  "x4_rank_5"  "x4_rank_4" 
[28] "x4_rank_1"  "x4_rank_2"  "x4_rank_3"  "x5_rank_1"  "x5_rank_3"  "x5_rank_5"  "x5_rank_4"  "x5_rank_2"  "x6_rank_5" 
[37] "x6_rank_4"  "x6_rank_1"  "x6_rank_2"  "x6_rank_3"  "x7_rank_5"  "x7_rank_4"  "x7_rank_2"  "x7_rank_1"  "x7_rank_3" 
[46] "x8_rank_5"  "x8_rank_4"  "x8_rank_1"  "x8_rank_2"  "x8_rank_3"  "x9_rank_3"  "x9_rank_2"  "x9_rank_1"  "x10_rank_4"
[55] "x10_rank_3" "x10_rank_1" "x10_rank_2"

推荐阅读