r - R:将 PCA 分数转换为百分位数和载荷符号
问题描述
我使用 prcomp 在包含 24 个健康变量和其他社会经济变量的数据集上执行 PCA。目的是获得一个可用于回归分析的单一健康指数。我使用以下代码:
# Run PCA on every column of `health` except the first one and keep only the
# first principal component (rank. = 1). Variables are NOT standardised here.
total_pca <- prcomp(health[, -1], scale. = FALSE, rank. = 1)
# prcomp() returns the scores as a matrix; take column 1 so the data frame
# stores a plain numeric vector instead of a one-column matrix.
data$pca <- total_pca$x[, 1]
# Convert into percentile. The scores are negated first, so a larger PC1
# score maps to a lower percentile.
# NOTE(review): this reads as "higher percentile = better health" only if a
# larger PC1 score means worse health -- check the signs of the loadings.
data$PVW <- ecdf(-data$pca)(-data$pca)
一篇文章中指出:“所有载荷均为正数,这意味着第一主成分的值越大,健康状况越差。随后将第一主成分转换为百分位数,并使更高的值反映更好的健康状况。因此,我们可以把估计出的健康参数解释为:健康指数每提高一个百分位所引起的工作概率的变化。”
我的问题是:我得到的载荷有正有负,那么我还能使用与文章中相同的解释吗?还是应该把负载荷的符号反转?如果是这样,我该怎么做?
另一个问题是,我的数据包含多个调查轮次(wave)。我在某处读到,不能按轮次把数据拆分成不同的子数据集、再对每个轮次分别做 PCA——这是正确的吗?如果是这样,那我该怎么做呢?
我的数据的一个可重现的例子是:
# Reproducible sample of the data: a data.frame with 40 rows and 25 columns.
# Column 1 is `wave`; the remaining 24 columns are the health variables.
# `bmi` carries class "labelled" with labels for the codes -3, -2 and -1.
health <- structure(list(wave = c(1, 2, 4, 5, 1, 5, 5, 4, 4, 1, 1, 1, 4, 2, 4, 2, 4, 6, 2, 4, 5, 1, 4, 1, 1, 2, 1, 2, 5, 2, 2, 4, 2, 1, 4, 4, 4, 1, 4, 2), fairpoor = c(1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0), adl = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), mental = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0), heart = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), blood = c(1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0), stroke = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), diabetes = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lung = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0), arthritis = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1), cancer = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), backjoint = c(1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0), doctor = c(1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), hospital = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), nursinghome = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), bmi = structure(c(32.1, 28.7, 24.7, 23.5, 25.1, 26.3, 22.8, 26.3, 17.2, 32.2, 21.2, 23.6, 28.3, 35.8, 28.3, 28.7, 28.1, 20.4, 23.7, 22.7, 20.4, 25.5, 29.7, 20.3, 20.8, 23.1, 23.3, 26.3, 34.2, 40.6, 24.9, 27.2, 26.4, 23.5, 32.1, 32.8, 26.0, 23.4, 23.7, 22.8), labels = structure(c(-3, -2, -1), .Names = c("Implausible/ suspected wrong", "Refusal", "Don't know")), class = "labelled"), walking = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), sitting = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), chair = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), stairs = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), kneeling = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), arm = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0), pullpush = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lifting = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), coin = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("wave", "fairpoor", "adl", "mental", "heart", "blood", "stroke", "diabetes", "lung", "arthritis", "cancer", "backjoint", "doctor", "hospital", "nursinghome", "bmi", "walking", "sitting", "chair", "stairs", "kneeling", "arm", "pullpush", "lifting", "coin"), row.names = c(323L, 1847L, 3731L, 5973L, 244L, 5914L, 6289L, 3847L, 3804L, 75L, 247L, 212L, 3878L, 1858L, 3994L, 2046L, 3920L, 9459L, 1850L, 
4000L, 6072L, 253L, 3826L, 148L, 319L, 1855L, 17L, 1849L, 5683L, 1791L, 2002L, 3744L, 2027L, 219L, 4052L, 3837L, 4008L, 127L, 3906L, 1880L), class = "data.frame")
解决方案
双标图可以更好地解释您的健康数据中的PCA。
在您给定的数据中,bmi 的数值量级远高于所有其他变量,因此在相同百分比的变化下,它对数据总方差的贡献更大。
因此,始终建议进行缩放,以使每个变量都具有同等重要性。此外,您的示例数据似乎具有固定列(所有个人均为 0),因此从分析中删除。
# Count the non-zero entries in each column. vapply() walks the data frame
# column by column (apply() would first coerce it to a matrix) and always
# returns an integer vector; na.rm = TRUE keeps a missing value from turning
# a column's count into NA.
non_zero_counts <- vapply(
  health,
  function(x) sum(x != 0, na.rm = TRUE),
  integer(1)
)
all_zero_col <- which(non_zero_counts == 0)
# removing columns with all zeroes from PCA (together with column 1): a
# constant column has zero variance and cannot be rescaled by scale. = TRUE
total_pca <- prcomp(health[, -c(1, all_zero_col)], scale. = TRUE)
主成分系数让您了解主成分如何与分析中的原始变量相关联。
# Biplot of the fitted PCA, with both axes restricted to the range [-2, 2].
library(ggbiplot)
pca_biplot <- ggbiplot(total_pca)
pca_biplot + xlim(c(-2, 2)) + ylim(c(-2, 2))
如您所见,几乎所有原始变量(糖尿病、肺除外)都与 PC1 负相关。如果原始变量取值越高表示健康状况越差,那么较高的 PC1 分数就对应较好的健康状况。
推荐阅读
- postgresql - 错误:文本搜索配置不存在
- react-native-flatlist - 即使数据数组是干净的,React 本机 Flatlist 数据也不会清除
- javascript - 无法自定义表单错误,因为服务器返回 422 Unprocessable Entity 而没有返回错误
- git - 应该相等的合并git分支实际上有很多差异
- visual-studio - 启动时的 Visual Studio 2017 问题
- jquery - 重叠的文本过渡
- android - NoClassDefFound 异常解决方案
- c++ - 在部分txt文档中查找信息并将它们存储在变量中
- javascript - 按对象内部的值对对象集合进行排序
- google-chrome - 为什么 Chrome 在反复刷新使用 WebAssembly 的页面后最终会抛出“Out of memory: wasm memory”?