r - 如何对 R 数据框进行采样,使其在多个变量中具有代表性?
问题描述
更新 6/16:
添加了调用验证函数的训练-测试拆分函数。添加了一个参数来为您的目标变量单独设置 alpha,以防您需要更高的标准。添加了库调用。
相关代码已放在 GitHub 上:https://github.com/KalebCoberly/train_test_split_R
结束更新
我想做一些类似于这篇关于 Pandas 数据框的帖子,但在 R 中,并且理想情况下无论数据类型如何(例如,具有因子和数字列的数据框)。
我想获得一个 R 数据框的随机样本,其中每个变量都相对代表总体。
我已经看到了基于单个变量创建分层样本的方法。但是,我想确保样本在多个列上都具有代表性,而不仅仅是在因子(factor)列上。
我编写了一个简单的算法来处理数值变量,对每个变量使用 Wilcoxon 检验。因此,如果样本(测试集)中的所有数字列似乎都来自与其余集(训练集)中的数字列相同的集合,那么您就有了一个相当有代表性的样本。您抽取一个随机样本并使用以下函数对其进行验证,然后重新采样和验证,直到您获得一个满足所有变量的最小代表性(由 alpha 测量)的样本。
在这种情况下,alpha 表示错误拒绝原假设的风险(H0 = 样本并非来自显著不同的总体,即它们代表同一总体)。因为我们不想拒绝原假设,所以我们希望 p 值大于 alpha 而不是小于 alpha,并且希望 alpha 尽可能高。
library(tidyverse)
train_test_split <- function(df, y_cols, id_cols, feats_lst, test_size = .3,
                             alpha = .5, target_alpha = .9, validate = TRUE) {
  # Randomly splits df into train/test sets and input/target (X/y) sets,
  # optionally resampling until validate_split() accepts the split.
  # Parameters:
  #   df: (data.frame) Full data set, including target variable(s).
  #   y_cols: (c(character)) Target column(s). Removed from the X sets and
  #     kept in the y sets.
  #   id_cols: (c(character)) Id column(s). They stay in the X sets and are
  #     also copied into the y sets.
  #   feats_lst: (c(character)) Names of the features to validate. Its order
  #     is reshuffled on every validation attempt.
  #   test_size: (numeric) Proportion of rows to use for test set. The row
  #     count is truncated to an integer. (Not range-checked here.)
  #   alpha: (numeric) Probability of incorrectly rejecting the null
  #     hypothesis.
  #       H0 = feature n of train and of test do not represent different sets.
  #         (i.e. representative split)
  #       H1 = feature n of train and of test represent different supersets.
  #   target_alpha: (numeric) Alpha to use if feature is target feature (i.e.
  #     if feature is in y_cols).
  #   validate: (bool) Should the split be validated? If FALSE, the first
  #     random split is returned as is.
  # Return:
  #   split_lst: (list(data.frame)) (train_X, train_y, test_X, test_y)
  #     train_X (data.frame) Input features in training subset.
  #     train_y (data.frame) Target variable in training subset.
  #     test_X (data.frame) Input features in testing subset.
  #     test_y (data.frame) Target variable in testing subset.
  # Side effects:
  #   When validate is TRUE, prints the average p-value recorded per feature
  #   over all attempts.
  # Note:
  #   With validate = TRUE the loop only ends once a split passes, so strict
  #   alphas or many features can make this run for a long time.

  split_lst <- list(
    'train_X' = data.frame(),
    'train_y' = data.frame(),
    'test_X' = data.frame(),
    'test_y' = data.frame()
  )

  full_set_len <- nrow(df)
  test_set_len <- as.integer(test_size * full_set_len)

  ###
  ### TO DO: Add a parameter and logic to choose whether to track this. ###
  ###
  # To track the p-values recorded for each feature across attempts:
  feats_p_av_lst <- vector(mode = 'list', length = length(feats_lst))
  names(feats_p_av_lst) <- feats_lst

  # Split and validate until valid.
  valid_split <- FALSE
  while (!valid_split) {
    # Split randomly.
    test_idx <- sample(x = full_set_len, size = test_set_len)
    # Use a positive complement rather than df[-test_idx, ]: negative
    # indexing with an empty test_idx would select zero rows and silently
    # empty the training set.
    train_idx <- setdiff(seq_len(full_set_len), test_idx)
    train_df <- df[train_idx, ]
    test_df <- df[test_idx, ]

    split_lst$train_X <- select(train_df, -all_of(y_cols))
    split_lst$train_y <- select(train_df, all_of(y_cols))
    split_lst$train_y[id_cols] <- split_lst$train_X[id_cols]

    split_lst$test_X <- select(test_df, -all_of(y_cols))
    split_lst$test_y <- select(test_df, all_of(y_cols))
    split_lst$test_y[id_cols] <- split_lst$test_X[id_cols]

    if (!validate) {
      valid_split <- TRUE
    } else {
      # Randomize test order to "cost-average" compute.
      feats_lst <- sample(feats_lst)

      # Test X and y separately to avoid the join compute and data copies.
      X_validation_results <- validate_split(
        train = split_lst$train_X,
        test = split_lst$test_X,
        feats_lst = feats_lst,
        y_cols = y_cols,
        feats_p_val_lst = feats_p_av_lst,
        alpha = alpha,
        target_alpha = target_alpha
      )
      feats_p_av_lst <- X_validation_results$p_vals

      # Only bother with the targets once the inputs have passed.
      if (X_validation_results$valid) {
        y_validation_results <- validate_split(
          train = split_lst$train_y,
          test = split_lst$test_y,
          feats_lst = feats_lst,
          y_cols = y_cols,
          feats_p_val_lst = feats_p_av_lst,
          alpha = alpha,
          target_alpha = target_alpha
        )
        feats_p_av_lst <- y_validation_results$p_vals
        if (y_validation_results$valid) {
          valid_split <- TRUE
        }
      }
    }
  }

  if (validate) {
    for (feat in names(feats_p_av_lst)) {
      p_vals <- feats_p_av_lst[[feat]]
      # A feature that was never tested has no p-values; report NA for it
      # instead of calling mean() on NULL, which warns.
      if (length(p_vals) > 0) {
        feats_p_av_lst[[feat]] <- mean(p_vals)
      } else {
        feats_p_av_lst[[feat]] <- NA_real_
      }
    }
    print('Average p-values:')
    print(feats_p_av_lst)
  }

  return(split_lst)
}
validate_split <- function(train, test, feats_lst, y_cols, feats_p_val_lst,
                           alpha = .5, target_alpha = .9) {
  # Conducts a Wilcoxon rank-sum test column by column to test if train and
  # test represent a similar superset. (i.e., is the split stratified on every
  # feature?) Each column is coerced with as.double() before testing, so the
  # check is only meaningful for numeric (i.e. continuous) features.
  # Features missing from either subset are skipped. Testing stops at the
  # first feature that fails, so later features get no p-value on that call.
  # Parameters:
  #   train: (data.frame) A subset of original set to compare to the other
  #     subset, test.
  #   test: (data.frame) A subset of original set to compare to the other
  #     subset, train.
  #   feats_lst: (c(character)) Names of features to test, in test order.
  #   y_cols: (c(character)) Vector of target features.
  #   feats_p_val_lst: (list(character:c(double))) Dictionary of p-values
  #     recorded so far, keyed by feature name; new p-values are appended.
  #   alpha: (numeric) Probability of incorrectly rejecting the null
  #     hypothesis.
  #       H0 = feature n of train and test does not represent different sets.
  #         (i.e. representative split)
  #       H1 = feature n of train and test represents a different superset.
  #   target_alpha: (numeric) Alpha to use if feature is target feature (i.e.
  #     if feature is in y_cols).
  # Return:
  #   list(valid: (bool), p_vals: (list(character:c(double))))
  #     valid: (bool) Are the sets representative of the same superset?
  #     p_vals: feats_p_val_lst with this call's p-values appended.
  for (feat in feats_lst) {
    # Only features present in both subsets can be compared.
    if (!(feat %in% colnames(train) && feat %in% colnames(test))) {
      next
    }

    this_alpha <- if (feat %in% y_cols) target_alpha else alpha

    p_value <- wilcox.test(
      x = as.double(train[[feat]]),
      y = as.double(test[[feat]])
    )$p.value
    feats_p_val_lst[[feat]] <- c(feats_p_val_lst[[feat]], p_value)

    # The p-value can be undefined (NaN) when every value is tied, e.g. a
    # column that is constant in both subsets. That gives no evidence of a
    # difference, so it is not treated as a rejection (and must not reach
    # if() as NA, which would abort with an error).
    if (!is.na(p_value) && p_value <= this_alpha) {
      # Reject H0: the split is unrepresentative on this feature.
      return(list('valid' = FALSE, 'p_vals' = feats_p_val_lst))
    }
  }

  list('valid' = TRUE, 'p_vals' = feats_p_val_lst)
}
在虚拟数据上进行测试:
# Dummy data: an integer id column plus three double columns of random
# values drawn without replacement.
sample_df <- data.frame(
  list(
    'Id' = c(1:1000),
    'y' = as.double(sample(1:1000, size = 1000)),
    'a' = as.double(sample(1:2000, size = 1000)),
    'b' = as.double(sample(1:3000, size = 1000))
  )
)

y_cols <- c('y')
id_cols <- c('Id')
# Only the double columns are validated; the integer Id column is excluded.
feats_lst <- colnames(select(sample_df, where(is.double)))

split_lst <- train_test_split(
  df = sample_df,
  y_cols = y_cols,
  id_cols = id_cols,
  feats_lst = feats_lst
)

# > names(split_lst)
# [1] "train_X" "train_y" "test_X" "test_y"

# You can call validate_split again on your found split to
# get your final p-values for each feature.
feats_p_val_lst <- vector(mode = 'list', length = length(feats_lst))
names(feats_p_val_lst) <- feats_lst

# Input features first ...
validate_split_lst <- validate_split(
  train = split_lst$train_X,
  test = split_lst$test_X,
  feats_lst = feats_lst,
  y_cols = y_cols,
  feats_p_val_lst = feats_p_val_lst
)

# ... then the target, carrying over the p-values collected above.
validate_split_lst <- validate_split(
  train = split_lst$train_y,
  test = split_lst$test_y,
  feats_lst = feats_lst,
  y_cols = y_cols,
  feats_p_val_lst = validate_split_lst$p_vals
)

print(validate_split_lst$p_vals)
# A list of all your feature names with their p-values.

print(validate_split_lst$valid)
# [1] TRUE
同样,这完全忽略了因子和整数列,除非您将它们转换为双精度数,但这样做会违反 Wilcoxon 检验关于数据连续的假设。
鉴于我当前的数据集包含大约 80 个变量,其中几乎一半是双精度数,这已经足够了,因为如果所有双精度变量都具有代表性,那么这些因子很可能也相当具有代表性。
但是,它需要很长时间才能运行并获得甚至 p > .5(即无法拒绝这些数据集不是来自不同人群(即并非不具代表性)的零假设)。而且,如果一个数据集的所有或大部分变量都是因子或整数呢?
有没有更好的方法,无论是从数学/统计角度,还是从 R/编程角度?另外,这对机器学习有什么问题吗?我认为它会提高训练/调优后模型的泛化能力,减少过拟合的机会,但它是否会以某种方式造成数据泄漏或其他问题?
解决方案
推荐阅读
- amazon-web-services - AWS Redshift“查询结果未准备好,状态:已完成”
- javascript - 串联和并联的 JS 事件
- php - 如何使用没有 html 标签的 html 模板发送 php 电子邮件?
- python - 如何在 C++ 中将地图元素推送到双端队列
- selenium - 如何运行一种方法直到驱动程序在 Selenium 中关闭
- python - Scrapy 404 错误:抓取网页时未处理或不允许 HTTP 状态代码
- autobahn - Autobahn.js 中的票证身份验证
- machine-learning - 如何为 Google AI Platform 上的 Keras 多输入模型格式化数据?
- reactjs - typescript 使用 actionType 作为子组件中的属性对 redux 和 redux-thunk 做出反应
- concurrency - 为什么在建立 httpSession 后 Wildfly 服务器不能处理并发 REST 请求