首页 > 解决方案 > 如何从R中的数据框中随机获取一些行后获取其余行

问题描述

我有 2 个数据框:`df_1` 和 `df_2`。现在我必须从 `df_1` 中随机选择一些行,然后将 `df_1` 中其余的行(即未被随机选中的行)与 `df_2` 合并。

我正在使用此代码

# Asker's original attempt (kept as posted; it does NOT produce the wanted result).
# Fix the RNG seed so the random sample below is reproducible.
set.seed(9999)
# Placeholder: the real data frame is elided here (see the reproducible data below).
df_1 <- # the whole dataset
# Draw 10 random rows as the test set (sample_n is presumably dplyr::sample_n).
test_dataset1 <- sample_n(df_1, 10)
# Why this fails: `%in%` applied to two data frames compares their columns
# (list elements), not their rows, and single-bracket indexing df_1[i] selects
# columns rather than rows. The result is therefore all of df_1 instead of the
# rows that were not sampled.
train_part_1 <- df_1[which(!df_1 %in% test_dataset1)] # Not working
# Intended final step: stack the unsampled rows of df_1 under df_2.
train_1 <- rbind(df_2, train_part_1)

但是,当我尝试提取未被随机选中的行时,我的代码不起作用:我得到的数据与 `df_1` 完全相同,也就是 20 行(同一个数据集)。

编辑:实际上,我必须制作 3 个测试(test)数据集和 3 个训练(train)数据集。那么,我怎样才能使用 seed 函数(`set.seed`),以便在重复运行时得到相同的数据集呢?

可重现的数据(仅 df_1):

# Reproducible sample of df_1 (dput output): a 20-row data.frame with three
# factor columns: nodeA, nodeB and scr. The expression is not assigned here;
# use `df_1 <- structure(...)` to work with it.
# Note: scr is a factor whose labels are numeric strings, so use
# as.numeric(as.character(scr)) if the numeric scores are needed.
structure(list(nodeA = structure(c(4L, 2L, 1L, 1L, 1L, 4L, 1L, 
                                   9L, 3L, 4L, 2L, 8L, 2L, 1L, 5L, 7L, 3L, 6L, 2L, 1L), .Label = c("ID00309", 
                                                                                                   "ID00361", "ID00541", "ID00570", "ID00615", "ID00696", "ID00762", 
                                                                                                   "ID01200", "ID05109"), class = "factor"), nodeB = structure(c(8L, 
                                                                                                                                                                 3L, 3L, 1L, 2L, 7L, 9L, 8L, 8L, 6L, 9L, 7L, 4L, 4L, 6L, 9L, 6L, 
                                                                                                                                                                 7L, 5L, 5L), .Label = c("ID00361", "ID00541", "ID00570", "ID00615", 
                                                                                                                                                                                         "ID00696", "ID01200", "ID05109", "ID11641", "ID11691"), class = "factor"), 
               scr = structure(20:1, .Label = c("1.85284606048794", "1.90444166064472", 
                                                "1.90762235378507", "1.94364188077133", "1.95883206119256", 
                                                "2.08440437841349", "2.26408172709962", "2.3223132020942", 
                                                "2.46120775935034", "2.49647215035727", "2.50432367561777", 
                                                "2.57541320006514", "2.65099330092281", "2.75209155741549", 
                                                "2.93717640337986", "2.99596628688011", "3.21209741517806", 
                                                "3.21997803385465", "3.48788394772132", "3.81389707587156"
               ), class = "factor")), class = "data.frame", row.names = c(NA, 
                                                                          -20L))

标签: r, random, dplyr, random-seed

解决方案


使用随机行号抽取样本,再用负索引(`-`)排除这些行号,从而得到其余(未被选中)的行:

# Example data from the question (dput output): 20 rows, three factor columns.
df_1 <- structure(list(nodeA = structure(c(4L, 2L, 1L, 1L, 1L, 4L, 1L, 9L, 3L, 4L, 
                                         2L, 8L, 2L, 1L, 5L, 7L, 3L, 6L, 2L, 1L), 
                                       .Label = c("ID00309", "ID00361", "ID00541", 
                                                  "ID00570", "ID00615", "ID00696", 
                                                  "ID00762", "ID01200", "ID05109"), 
                                       class = "factor"), 
                     nodeB = structure(c(8L, 3L, 3L, 1L, 2L, 7L, 9L, 8L, 8L, 6L, 
                                         9L, 7L, 4L, 4L, 6L, 9L, 6L, 7L, 5L, 5L), 
                                       .Label = c("ID00361", "ID00541", "ID00570", 
                                                  "ID00615", "ID00696", "ID01200", 
                                                  "ID05109", "ID11641", "ID11691"), 
                                       class = "factor"), 
                     scr = structure(20:1, .Label = c("1.85284606048794", "1.90444166064472", 
                                                      "1.90762235378507", "1.94364188077133", 
                                                      "1.95883206119256", "2.08440437841349", 
                                                      "2.26408172709962", "2.3223132020942", 
                                                      "2.46120775935034", "2.49647215035727", 
                                                      "2.50432367561777", "2.57541320006514", 
                                                      "2.65099330092281", "2.75209155741549", 
                                                      "2.93717640337986", "2.99596628688011", 
                                                      "3.21209741517806", "3.21997803385465", 
                                                      "3.48788394772132", "3.81389707587156"
                     ), class = "factor")), 
                class = "data.frame", row.names = c(NA, -20L))

# Fix the seed so the same rows are drawn on every run. For several test/train
# pairs, set the seed once and repeat the sampling step; the whole sequence of
# draws is then reproducible.
set.seed(9999)
# Draw 10 distinct row numbers from 1..nrow(df_1).
Selected <- sample.int(nrow(df_1), 10)
# Data frames are indexed as df[row, col]; leaving `col` empty keeps every
# column. Indexing by Selected returns the rows in the order they were drawn.
test_dataset1 <- df_1[Selected, ]
# Keep the rows that were not sampled. This gives the same result as
# df_1[-Selected, ], but the logical mask is also correct when Selected is
# empty (df_1[-integer(0), ] would return zero rows instead of all rows).
train_part_1 <- df_1[!(seq_len(nrow(df_1)) %in% Selected), ]
# To finish the task from the question: train_1 <- rbind(df_2, train_part_1)

# Print the 10 sampled (test) rows. The `#>` lines are the output recorded by
# reprex; the leftmost column is the row's original position in df_1.
test_dataset1
#>      nodeA   nodeB              scr
#> 6  ID00570 ID05109 2.93717640337986
#> 9  ID00541 ID11641 2.57541320006514
#> 19 ID00361 ID00696 1.90444166064472
#> 3  ID00309 ID00570 3.21997803385465
#> 10 ID00570 ID01200 2.50432367561777
#> 2  ID00361 ID00570 3.48788394772132
#> 20 ID00309 ID00696 1.85284606048794
#> 8  ID05109 ID11641 2.65099330092281
#> 12 ID01200 ID05109 2.46120775935034
#> 18 ID00696 ID05109 1.90762235378507
# Print the remaining (training) rows: the 10 rows of df_1 whose row numbers do
# not appear in the sample above.
train_part_1
#>      nodeA   nodeB              scr
#> 1  ID00570 ID11641 3.81389707587156
#> 4  ID00309 ID00361 3.21209741517806
#> 5  ID00309 ID00541 2.99596628688011
#> 7  ID00309 ID11691 2.75209155741549
#> 11 ID00361 ID11691 2.49647215035727
#> 13 ID00361 ID00615  2.3223132020942
#> 14 ID00309 ID00615 2.26408172709962
#> 15 ID00615 ID01200 2.08440437841349
#> 16 ID00762 ID11691 1.95883206119256
#> 17 ID00541 ID01200 1.94364188077133

由 reprex 包(v1.0.0)于 2021-03-14 创建


推荐阅读