首页 > 解决方案 > r计算数据框中多列的综合得分和可靠性

问题描述

我想为数据框中的多组题项(item)分别计算综合得分(composite score)和 Cronbach's alpha,并把结果存入一个新的数据框。
这是我当前数据框的(一部分):

structure(list(T1PP_1 = c(6, 7, 4, 5, 4, 6, 5, 6, 6, 5), T1PP_2 = c(3, 
4, 4, 5, 2, 5, 5, 6, 6, 3), T1PP_3 = c(5, 7, 6, 7, 6, 7, 6, 5, 
6, 5), T1PP_4 = c(3, 6, 5, 5, 6, 5, 4, 6, 6, 4), T1PP_5 = c(4, 
6, 5, 6, 5, 3, 6, 3, 5, 4), T1PP_7 = c(4, 6, 5, 5, 4, 7, 4, 5, 
6, 2), T1PP_8 = c(5, 6, 4, 6, 4, 2, 4, 5, 5, 5), T1PP_9 = c(5, 
6, 5, 6, 4, 5, 3, 7, 5, 6), T1PP_10 = c(3, 6, 3, 4, 5, 2, 3, 
6, 6, 3), T1PP_11 = c(5, 6, 4, 5, 3, 1, 5, 3, 5, 2), t1se_1 = c(4, 
5, 4, 4, 4, 1, 5, 4, 4, 4), t1se_2 = c(3, 5, 4, 5, 4, 1, 5, 2, 
4, 4), t1se_3 = c(4, 4, 4, 4, 3, 4, 5, 4, 5, 4), t1se_4 = c(3, 
5, 4, 5, 4, 4, 5, 4, 5, 4), t1se_5 = c(4, 5, 4, 4, 4, 4, 5, 5, 
4, 4), t1se_6 = c(4, 5, 4, 5, 4, 4, 5, 5, 5, 4), t1se_7 = c(4, 
5, 3, 5, 4, 4, 5, 5, 5, 3), t1se_8 = c(3, 5, 3, 4, 4, 3, 5, 5, 
5, 4), t1ogoal_1 = c(4, 5, 4, 5, 5, 5, 5, 5, 2, 5), t1ogoal_2 = c(4, 
4, 4, 5, 5, 5, 5, 4, 4, 5), t1ogoal_3 = c(4, 5, 4, 5, 4, 3, 4, 
2, 4, 5), t1ogoal_4 = c(4, 5, 3, 4, 2, 3, 3, 1, 2, 4), t1ogoal_5 = c(4, 
5, 3, 5, 5, 5, 4, 2, 3, 5), t1ogoal_6 = c(4, 5, 5, 4, 5, 5, 3, 
5, 4, 5), t1ogoal_7 = c(4, 5, 5, 5, 5, 5, 5, 5, 4, 5)), row.names = c(NA, 
10L), class = "data.frame")

基于示例数据框,新数据框应如下所示:

structure(list(T1PP_comp = c(2.4, 5.4, 3.1, 4.9, 4.2, 4.6, 4.1, 
4.1, 4, 4.8), T1PP_alpha = c(2.4, 5.4, 3.1, 4.9, 4.2, 4.6, 4.1, 
4.1, 4, 4.8), t1se_comp = c(2.375, 2.75, 1.625, 3.875, 2.625, 
2.625, 3.5, 3.5, 2.375, 3.5), t1se_alpha = c(2.375, 2.75, 1.625, 
3.875, 2.625, 2.625, 3.5, 3.5, 2.375, 3.5), t1ogoal_comp = c(1.4, 
3.5, 2.6, 2.7, 2.5, 2.6, 3, 3, 2.6, 3.2)), row.names = c(NA, 
10L), class = "data.frame")

所以我想要的是遍历属于一起的多个列(即,形成一个变量,如 T1PP_1 到 T1PP_11)以获得综合得分和 cronbach 的 alpha。这是最初的尝试:

#' Composite score and scale score for one set of items.
#'
#' Selects the columns of `data` whose names match `variable_name` and appends
#' two columns computed from those items:
#'   * `<prefix>_comp`  - `composite(items, nomiss = 0.8)`
#'   * `<prefix>_alpha` - the `scores` element of `psych::alpha(items, ...)`
#' `<prefix>` is `variable_name` with trailing underscores removed, so the
#' pattern "T1PP_" produces "T1PP_comp" instead of "T1PP__comp".
#'
#' @param data Data frame holding the item columns.
#' @param variable_name Regular expression handed to `matches()`.
#' @param ... Further arguments forwarded to `psych::alpha()`.
#' @return The matched item columns followed by the two new columns.
comp_and_alph <- function(data = my_dat, variable_name, ...) {
  items <- data %>%
    select(matches(variable_name))
  if (ncol(items) == 0L) {
    stop("No columns match '", variable_name, "'.", call. = FALSE)
  }

  # Drop the trailing "_" of the pattern so the suffix is joined by exactly
  # one underscore.
  prefix <- sub("_+$", "", variable_name)

  # Both statistics are computed from the items only, before any new column
  # is attached, so neither result can leak into the other.
  comp_score <- composite(items, nomiss = 0.8)
  # NOTE(review): per the psych documentation `$scores` holds per-respondent
  # scale scores, not the alpha coefficient (that lives in `$total`) -
  # confirm this is really the column wanted under the name "alpha".
  scale_score <- psych::alpha(items, ...)$scores

  items[[paste0(prefix, "_comp")]] <- comp_score
  items[[paste0(prefix, "_alpha")]] <- scale_score
  items
}

#' Append composite and scale-score columns for several item sets.
#'
#' Calls `comp_and_alph()` once per entry of `variables` and binds the columns
#' it created onto `data` by position.
#'
#' The results are deliberately not combined with `merge()`: merging on the
#' shared item columns is a join on non-unique keys, so respondents with
#' identical answer patterns are multiplied at every step (and rows get
#' re-sorted). With many scales this blows up the row count and exhausts
#' memory.
#'
#' @param data Data frame holding all item columns.
#' @param variables Character vector of patterns, one per item set.
#' @param ... Further arguments forwarded to `comp_and_alph()`.
#' @return `data` with the new columns appended; rows keep their order.
comp_and_alph_all <- function(data, variables, ...) {
  new_cols <- lapply(variables, function(v) {
    scored <- comp_and_alph(data, v, ...)
    # Keep only what comp_and_alph() added; the items are already in `data`.
    scored[setdiff(names(scored), names(data))]
  })
  # With an empty `variables` this simply returns `data`.
  do.call(cbind, c(list(data), new_cols))
}

问题是我的数据框有大约 350 行和 200 多个列(项目),来自大约 40 个变量。当我运行上面的代码(并添加超过前三个变量)时,我的内存不足:

# Score five item sets of `my_dat`; each string is the pattern identifying
# the columns of one scale.
comp_and_alph_all(
  data = my_dat,
  variables = c("T1PP_", "t1se_", "t1ogoal_", "t1TFPa_", "t1TFPr_")
)

错误:向量内存耗尽(达到限制?)

现在我想知道是否有更有效的解决方案?谢谢!

标签: r

解决方案


于是我找到了一个简单的解决方案:
下面是我真实数据框的节选(这里的输出只包含一行),方便你理解后面的代码:

structure(list(durationt1 = 511, t1date_diff = 811, t1pa_1 = 4, 
    t1pa_2 = 5, t1pa_3 = 5, t1pa_4 = 2, t1pa_5 = 3, t1pa_6 = 4, 
    t1pa_7 = 4, t1pa_8 = 3, t1pa_9 = 4, t1pa_10 = 4, t1na_1 = 1, 
    t1na_2 = 1, t1na_3 = 3, t1na_4 = 1, t1na_5 = 1, t1na_6 = 1, 
    t1na_7_fa_2 = 3, t1na_8 = 1, t1na_9 = 1, t1na_10 = 1, t1fa_1 = 4, 
    t1fa_3 = 1, t1pp_1 = 5, t1pp_2 = 4, t1pp_3 = 6, t1pp_4 = 5, 
    t1pp_5 = 4, t1pp_7 = 5, t1pp_8 = 5, t1pp_9 = 5, t1pp_10 = 4, 
    t1pp_11 = 4, t1se_1 = 3, t1se_2 = 3, t1se_3 = 4, t1se_4 = 4, 
    t1se_5 = 4, t1se_6 = 4, t1se_7 = 3, t1se_8 = 4, t1ogoal_1 = 4, 
    t1ogoal_2 = 3, t1ogoal_3 = 3, t1ogoal_4 = 2, t1ogoal_5 = 3, 
    t1ogoal_6 = 4, t1ogoal_7 = 4, t1ogoal_9 = 3, t1ogoal_10 = 4, 
    t1ogoal_11 = 4, t1tfpa_1 = 4, t1tfpa_2 = 4, t1tfpa_3 = 4, 
    t1tfpa_4 = 4, t1tfpr_1 = 5, t1tfpr_2 = 4, t1tfpr_3 = 5, t1tfpr_4 = 5, 
    t1tffu_1 = 5, t1tffu_2 = 5, t1tffu_3 = 5, t1tffu_4 = 5, t1cpl_1 = 3, 
    t1cpl_2 = 3, t1cpl_3 = 3, t1cpl_4 = 4, t1cpl_5 = 3, t1cpl_6 = 3, 
    t1eff = 4, t1search_1 = 5, t1search_2 = 5, t1search_3 = 6, 
    t1wor_1 = 3, t1wor_2 = 2, t1wor_3 = 1, t1wor_4 = 1, t1scom_1 = 3, 
    t1scom_2 = 3, t1scom_3 = 3, t1angra = 9, t1anful = 9, t1anune = 70, 
    t1anpar = 10, t1ansel = 10, t1anint = 0, t1gaemp_1 = 5, t1gaemp_2 = 5, 
    t1gaemp_3 = 5, t1gaemp_4 = 5, t1gaemn_1 = 2, t1gaemn_2 = 2, 
    t1gaemn_3 = 1, t1gaemn_4 = 1, t1jaemp_1 = 5, t1jaemp_2 = 5, 
    t1jaemp_3 = 5, t1jaemp_4 = 5, t1jaemn_1 = 5, t1jaemn_2 = 3, 
    t1jaemn_3 = 1, t1jaemn_4 = 3, t1chjf_1 = 4, t1chjf_2 = 4, 
    t1chjf_3 = 4, t1hajf_1 = 2, t1hajf_2 = 3, t1hajf_3 = 2, t1chjs_1 = 5, 
    t1chjs_2 = 5, t1chjs_3 = 5, t1hajs_1 = 1, t1hajs_2 = 2, t1hajs_3 = 3, 
    t1heal1 = 3, t1sex = 1, t1age = 51, t1lang = 1, t1preint = 0, 
    t1presu = 0, t1prevo = 0, t1prept = 1, t1preft = 1, t1prese = 0, 
    t1preot = 0, t1stime = 2, t2job = NA_real_, t2pa_1 = NA_real_, 
    t2pa_2 = NA_real_, t2pa_3 = NA_real_, t2pa_4 = NA_real_, 
    t2pa_5 = NA_real_, t2pa_6 = NA_real_, t2pa_7 = NA_real_, 
    t2pa_8 = NA_real_, t2pa_9 = NA_real_, t2pa_10 = NA_real_, 
    t2na_1 = NA_real_, t2na_2 = NA_real_, t2na_3 = NA_real_, 
    t2na_4 = NA_real_, t2na_5 = NA_real_, t2na_6 = NA_real_, 
    t2na_7_fa_2 = NA_real_, t2na_8 = NA_real_, t2na_9 = NA_real_, 
    t2na_10 = NA_real_, t2fa_1 = NA_real_, t2fa_3 = NA_real_, 
    t2search_1 = NA_real_, t2search_2 = NA_real_, t2search_3 = NA_real_, 
    t2eff = NA_real_, t2empse_1 = NA_real_, t2empse_2 = NA_real_, 
    t2empse_3 = NA_real_, t2se_1 = NA_real_, t2se_2 = NA_real_, 
    t2se_3 = NA_real_, t2se_4 = NA_real_, t2se_5 = NA_real_, 
    t2se_6 = NA_real_, t2se_7 = NA_real_, t2se_8 = NA_real_, 
    t2wor_1 = NA_real_, t2wor_2 = NA_real_, t2wor_3 = NA_real_, 
    t2wor_4 = NA_real_, t2scom_1 = NA_real_, t2scom_2 = NA_real_, 
    t2scom_3 = NA_real_, t2cpl_1 = NA_real_, t2cpl_2 = NA_real_, 
    t2cpl_3 = NA_real_, t2cpl_4 = NA_real_, t2cpl_5 = NA_real_, 
    t2cpl_6 = NA_real_, t2angra = NA_real_, t2anful = NA_real_, 
    t2anune = NA_real_, t2anpar = NA_real_, t2ansel = NA_real_, 
    t2anint = NA_real_, t2gaemp_1 = NA_real_, t2gaemp_2 = NA_real_, 
    t2gaemp_3 = NA_real_, t2gaemp_4 = NA_real_, t2gaemn_1 = NA_real_, 
    t2gaemn_2 = NA_real_, t2gaemn_3 = NA_real_, t2gaemn_4 = NA_real_, 
    t2jaemp_1 = NA_real_, t2jaemp_2 = NA_real_, t2jaemp_3 = NA_real_, 
    t2jaemp_4 = NA_real_, t2jaemn_1 = NA_real_, t2jaemn_2 = NA_real_, 
    t2jaemn_3 = NA_real_, t2jaemn_4 = NA_real_, t2chjf_1 = NA_real_, 
    t2chjf_2 = NA_real_, t2chjf_3 = NA_real_, t2hajf_1 = NA_real_, 
    t2hajf_2 = NA_real_, t2hajf_3 = NA_real_, t2chjs_1 = NA_real_, 
    t2chjs_2 = NA_real_, t2chjs_3 = NA_real_, t2hajs_1 = NA_real_, 
    t2hajs_2 = NA_real_, t2hajs_3 = NA_real_, t2heal1 = NA_real_, 
    j3job = NA_real_, t3job_1 = NA_real_, t3pa_1 = NA_real_, 
    t3pa_2 = NA_real_, t3pa_3 = NA_real_, t3pa_4 = NA_real_, 
    t3pa_5 = NA_real_, t3pa_6 = NA_real_, t3pa_7 = NA_real_, 
    t3pa_8 = NA_real_, t3pa_9 = NA_real_, t3pa_10 = NA_real_, 
    t3na_1 = NA_real_, t3na_2 = NA_real_, t3na_3 = NA_real_, 
    t3na_4 = NA_real_, t3na_5 = NA_real_, t3na_6 = NA_real_, 
    t3na_7_fa_2 = NA_real_, t3na_8 = NA_real_, t3na_9 = NA_real_, 
    t3na_10 = NA_real_, t3fa_1 = NA_real_, t3fa_3 = NA_real_, 
    t3empse_1 = NA_real_, t3empse_2 = NA_real_, t3empse_3 = NA_real_, 
    t3tfpa_1 = NA_real_, t3tfpa_2 = NA_real_, t3tfpa_3 = NA_real_, 
    t3tfpa_4 = NA_real_, t3tfpr_1 = NA_real_, t3tfpr_2 = NA_real_, 
    t3tfpr_3 = NA_real_, t3tfpr_4 = NA_real_, t3tffu_1 = NA_real_, 
    t3tffu_2 = NA_real_, t3tffu_3 = NA_real_, t3tffu_4 = NA_real_, 
    t3se_1 = NA_real_, t3se_2 = NA_real_, t3se_3 = NA_real_, 
    t3se_4 = NA_real_, t3se_5 = NA_real_, t3se_6 = NA_real_, 
    t3se_7 = NA_real_, t3se_8 = NA_real_, t3pofit_1 = NA_real_, 
    t3pofit_2 = NA_real_, t3pofit_3 = NA_real_, t3nsfit_1 = NA_real_, 
    t3nsfit_2 = NA_real_, t3nsfit_3 = NA_real_, t3dafit_1 = NA_real_, 
    t3dafit_2 = NA_real_, t3dafit_3 = NA_real_, t3jobsa_1 = NA_real_, 
    t3jobsa_2 = NA_real_, t3jobsa_3 = NA_real_, t3mean_1 = NA_real_, 
    t3mean_2 = NA_real_, t3mean_3 = NA_real_, t3mean_4 = NA_real_, 
    t3mean_5 = NA_real_, t3mean_7 = NA_real_, t3angra = NA_real_, 
    t3anful = NA_real_, t3anpar = NA_real_, t3ansel = NA_real_, 
    t3anint = NA_real_, t3anune = NA_real_, t3heal1 = NA_real_, 
    j4job = NA_real_, t4job_1 = NA_real_, t4pa_1 = NA_real_, 
    t4pa_2 = NA_real_, t4pa_3 = NA_real_, t4pa_4 = NA_real_, 
    t4pa_5 = NA_real_, t4pa_6 = NA_real_, t4pa_7 = NA_real_, 
    t4pa_8 = NA_real_, t4pa_9 = NA_real_, t4pa_10 = NA_real_, 
    t4na_1 = NA_real_, t4na_2 = NA_real_, t4na_3 = NA_real_, 
    t4na_4 = NA_real_, t4na_5 = NA_real_, t4na_6 = NA_real_, 
    t4na_7_fa_2 = NA_real_, t4na_8 = NA_real_, t4na_9 = NA_real_, 
    t4na_10 = NA_real_, t4fa_1 = NA_real_, t4fa_3 = NA_real_, 
    t4tfpa_1 = NA_real_, t4tfpa_2 = NA_real_, t4tfpa_3 = NA_real_, 
    t4tfpa_4 = NA_real_, t4tfpr_1 = NA_real_, t4tfpr_2 = NA_real_, 
    t4tfpr_3 = NA_real_, t4tfpr_4 = NA_real_, t4tffu_1 = NA_real_, 
    t4tffu_2 = NA_real_, t4tffu_3 = NA_real_, t4tffu_4 = NA_real_, 
    t4se_1 = NA_real_, t4se_2 = NA_real_, t4se_3 = NA_real_, 
    t4se_4 = NA_real_, t4se_6 = NA_real_, t4se_7 = NA_real_, 
    t4se_8 = NA_real_, t4se_9 = NA_real_, t4pofit_1 = NA_real_, 
    t4pofit_2 = NA_real_, t4pofit_4 = NA_real_, t4nsfit_1 = NA_real_, 
    t4nsfit_2 = NA_real_, t4nsfit_4 = NA_real_, t4dafit_1 = NA_real_, 
    t4dafit_2 = NA_real_, t4dafit_4 = NA_real_, t4jobsa_1 = NA_real_, 
    t4jobsa_2 = NA_real_, t4jobsa_3 = NA_real_, t4mean_1 = NA_real_, 
    t4mean_2 = NA_real_, t4mean_3 = NA_real_, t4mean_4 = NA_real_, 
    t4mean_5 = NA_real_, t4mean_7 = NA_real_, t4angra = NA_real_, 
    t4anful = NA_real_, t4anpar = NA_real_, t4ansel = NA_real_, 
    t4anint = NA_real_, t4anune = NA_real_, t4heal1 = NA_real_), row.names = c(NA, 
-1L), class = c("tbl_df", "tbl", "data.frame"))

我稍微研究了一下 map 函数,最终得到了下面的做法:
首先,只选出需要计算综合得分和 Cronbach's alpha 的列。

library(tidyverse)
library(multicon)
library(psych)

# Stems of the scales to score. A column belongs to a scale when its name is
# "t<wave><stem>_<item>", e.g. "t1pa_3" or "t3tfpr_2".
scale_stems <- c(
  "pa", "na", "pp", "se", "ogoal", "tfpa", "tfpr", "tffu", "cpl", "search",
  "wor", "scom", "gaemp", "gaemn", "jaemp", "jaemn", "chjf", "hajf", "chjs",
  "hajs", "empse", "pofit", "nsfit", "dafit", "jobsa", "mean"
)

# The pattern is assembled from the vector instead of being written as one
# string literal broken over several lines: a line break inside the literal
# puts the newline and the indentation into the regex, so the alternatives
# that start a continuation line (scom, empse) can never match. The pattern
# is anchored so that e.g. "pa" does not pick up "tfpa" columns by accident;
# tfpa/tfpr/tffu are listed explicitly instead.
item_pattern <- paste0("^t.(", paste(scale_stems, collapse = "|"), ")_")

comp_dat <- mplus_dat %>%
  select(matches(item_pattern))

我使用 split.default() 按列名的一部分进行拆分:

# Split the columns (not the rows) of `comp_dat` into one data frame per
# scale; the scale label is the column name up to its first underscore.
comp_split <- split.default(comp_dat, f = sub("_.*", "", names(comp_dat)))

最后,用 map 对每个量表分别计算综合得分和 Cronbach's alpha:

# Composite score per scale: one result per element of `comp_split`.
# (The former extra `data = .x` argument to map() was dropped; it only ended
# up in the lambda's unused `...`.)
comp <- map(
  comp_split,
  function(items) multicon::composite(items, nomiss = 0.8)
)

# psych::alpha() per scale, keeping only its `total` element (the overall
# reliability summary according to the psych documentation).
alph <- map(comp_split, function(items) psych::alpha(items)$total)

# One column per scale for the composites, one row per scale for the
# reliability summaries.
comp_df <- do.call("cbind", comp)
alph_df <- do.call("rbind", alph)
comp_df
alph_df

我最终得到了两个不错的 df,其中包含我想要的信息。


推荐阅读