r - 尝试将数据框转换为宽格式,同时按两个因子列的组合(交互)生成新列
问题描述
下面是我的可重现示例数据框:其中有一个年份列(YR)和一个癌症列(CANCER),两者都是因子。我想把数据框转换为宽格式,并让新列对应这两个因子的每一种组合。这解释起来有点复杂,所以我同时给出了原始数据框和期望得到的最终数据框。基本上,我是想按年份统计每种癌症的个体记录数。
理想情况下,希望用管道(%>%)来完成,因为我对它更熟悉;同时方案要能推广到我的实际数据框——其中年份有 10 个水平、癌症有 15 个水平,另外还有一个含 10 个水平的列,我想对它做同样的处理。
# Reproducible example: one row per individual cancer record.
# YR, STATE, COUNTY, CANCER and rand_fact are factors; rand_num is numeric and
# rand_chr is character. The columns are built inline so no temporary
# top-level vectors have to be created and removed afterwards.
TEST_DR <- data.frame(
  YR = factor(c(2019, 2018, 2019, 2019, 2018, 2018, 2019, 2019, 2018)),
  STATE = factor(c("CA", "MA", "KY", "KY", "CA", "MA", "KY", "KY", "CA")),
  COUNTY = factor(c("C1", "M1", "K1", "K2", "C1", "M2", "K1", "K2", "C1")),
  CANCER = factor(c(
    "Cervical", "Lung", "Prostate", "Breast", "Cervical",
    "Breast", "Prostate", "Prostate", "Lung"
  )),
  rand_fact = factor(c(
    "rf1", "rf2", "rf3", "fr4", "fr5", "rf2", "rf3", "fr4", "fr5"
  )),
  rand_num = c(4, 3, 5, 7, 3, 5, 3, 24, 9),
  rand_chr = c("a", "d", "r", "e", "g", "y", "r", "e", "k")
)

# Sort by year, then county (factors sort by level order; ties keep their
# original order). Base order() is used so the setup runs without any package
# being attached; row names are reset so the rows are numbered 1..n again.
TEST_DR <- TEST_DR[order(TEST_DR$YR, TEST_DR$COUNTY), ]
rownames(TEST_DR) <- NULL
> print(TEST_DR)
YR STATE COUNTY CANCER rand_fact rand_num rand_chr
2018 CA C1 Cervical fr5 3 g
2018 CA C1 Lung fr5 9 k
2018 MA M1 Lung rf2 3 d
2018 MA M2 Breast rf2 5 y
2019 CA C1 Cervical rf1 4 a
2019 KY K1 Prostate rf3 5 r
2019 KY K1 Prostate rf3 3 r
2019 KY K2 Breast fr4 7 e
2019 KY K2 Prostate fr4 24 e
下面是期望得到的结果表:2018 年对应一组癌症列,2019 年对应另一组癌症列;此外,每个年份各有一列行合计,最后还有一列总合计。
# Hand-built target table: one row per STATE/COUNTY pair, one count column per
# cancer type and year, a per-year row total, and an overall row total.
STATE <- factor(c("CA", "MA", "MA", "KY", "KY"))
COUNTY <- factor(c("C1", "M1", "M2", "K1", "K2"))

# 2018 counts per cancer type
Cervical_18 <- c(1, 0, 0, 0, 0)
Lung_18 <- c(1, 1, 0, 0, 0)
Prostate_18 <- c(0, 0, 0, 0, 0)
Breast_18 <- c(0, 0, 1, 0, 0)

# 2019 counts per cancer type
Cervical_19 <- c(1, 0, 0, 0, 0)
Lung_19 <- c(0, 0, 0, 0, 0)
Prostate_19 <- c(0, 0, 0, 2, 1)
Breast_19 <- c(0, 0, 0, 0, 1)

# Row totals: per year, then across both years
total_C18 <- c(2, 1, 1, 0, 0)
total_C19 <- c(1, 0, 0, 2, 2)
total_cancer <- c(3, 1, 1, 2, 2)

TEST_DR3 <- data.frame(
  STATE, COUNTY,
  Cervical_18, Lung_18, Prostate_18, Breast_18, total_C18,
  Cervical_19, Lung_19, Prostate_19, Breast_19, total_C19,
  total_cancer
)

# Drop the temporary column vectors; only TEST_DR3 is kept.
rm(list = c(
  "STATE", "COUNTY",
  "Cervical_18", "Lung_18", "Prostate_18", "Breast_18", "total_C18",
  "Cervical_19", "Lung_19", "Prostate_19", "Breast_19", "total_C19",
  "total_cancer"
))
> print(TEST_DR3)
STATE COUNTY Cervical_18 Lung_18 Prostate_18 Breast_18 total_C18 Cervical_19 Lung_19 Prostate_19 Breast_19 total_C19 total_cancer
1 CA C1 1 1 0 0 2 1 0 0 0 1 3
2 MA M1 0 1 0 0 1 0 0 0 0 0 1
3 MA M2 0 0 0 1 1 0 0 0 0 0 1
4 KY K1 0 0 0 0 0 0 0 2 0 2 2
5 KY K2 0 0 0 0 0 0 0 1 1 2 2
解决方案
这是一个 tidyverse 解决方案:
library(tidyr)
library(dplyr)
# Count records per YR/STATE/COUNTY/CANCER, append a per-year "total" row for
# every STATE/COUNTY pair, then spread to one column per CANCER_YR combination
# and add an overall row total.
TEST_DR %>%
  # The count column is named "value" because that is pivot_wider()'s default
  # `values_from`, so it does not have to be specified later.
  count(YR, STATE, COUNTY, CANCER, name = "value") %>%
  # Append the totals with bind_rows() rather than rows_insert(): CANCER is a
  # factor in the counts but the character "total" in the totals, and
  # rows_insert() in recent dplyr casts the new rows to the factor type, where
  # "total" is not a level. bind_rows() combines the two into character.
  bind_rows(
    .,
    summarise(
      group_by(., STATE, COUNTY, YR),
      CANCER = "total",
      value = sum(value),
      .groups = "drop"
    )
  ) %>%
  # Within each year keep cancers in order of first appearance, "total" last;
  # this row order determines the order of the widened columns.
  arrange(YR, factor(CANCER, unique(CANCER))) %>%
  pivot_wider(
    id_cols = c(STATE, COUNTY),
    names_from = c(CANCER, YR),
    values_fn = sum,
    values_fill = 0L
  ) %>%
  # Overall total = sum of the per-year total_* columns.
  mutate(total_cancer = rowSums(across(starts_with("total"))))
输出
`summarise()` regrouping output by 'STATE', 'COUNTY' (override with `.groups` argument)
# A tibble: 5 x 11
STATE COUNTY Cervical_2018 Lung_2018 Breast_2018 total_2018 Cervical_2019 Breast_2019 Prostate_2019 total_2019 total_cancer
<fct> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <dbl>
1 CA C1 1 1 0 2 1 0 0 1 3
2 MA M1 0 1 0 1 0 0 0 0 1
3 MA M2 0 0 1 1 0 0 0 0 1
4 KY K2 0 0 0 0 0 1 1 2 2
5 KY K1 0 0 0 0 0 0 2 2 2
或者使用 reshape2::dcast:
library(reshape2)
library(dplyr)
# Same approach as a pivot to wide format, but using reshape2::dcast():
# count records, append per-year "total" rows, cast to one column per
# CANCER_YR combination, then add an overall row total.
TEST_DR %>%
  count(YR, STATE, COUNTY, CANCER, name = "value") %>%
  # Append the totals with bind_rows() rather than rows_insert(): CANCER is a
  # factor in the counts but the character "total" in the totals, and
  # rows_insert() in recent dplyr casts the new rows to the factor type, where
  # "total" is not a level. bind_rows() combines the two into character.
  bind_rows(
    .,
    summarise(
      group_by(., STATE, COUNTY, YR),
      CANCER = "total",
      value = sum(value),
      .groups = "drop"
    )
  ) %>%
  arrange(YR, factor(CANCER, unique(CANCER))) %>%
  # Name the aggregation function and the value column explicitly instead of
  # relying on positional matching and dcast()'s value-column guess.
  dcast(
    STATE + COUNTY ~ CANCER + YR,
    fun.aggregate = sum,
    value.var = "value"
  ) %>%
  # Overall total = sum of the per-year total_* columns.
  mutate(total_cancer = rowSums(across(starts_with("total"))))
推荐阅读
- windows - 如何在 Windows 10 中设置来自网络的壁纸?
- c++ - 在 C++ 中正确返回和存储引用的语法
- angular - 循环中的 NgRx-Entity-Selector
- tensorflow - 尝试在 Google Colab 上训练 ResNet 时出现 ResourceExhaustedError
- c++ - 使用英特尔 C++ 编译器定义 constexpr 类型
- arrays - 如何使用全局变量数组实现 userDefaults
- ssh - SSH 连接 GCP
- python - 查找给定的输入是否幸运
- service-worker - Workbox 预热缓存问题
- sql - 使用来自同一文档的值从 JSONB 文档中的数组中选择项目