首页 > 解决方案 > 试图扩大数据框,同时通过它们的因素交互两列

问题描述

所以这是我的可重现示例数据框,我有一个年份列和一个癌症列,两者都是因素,我试图在与它们交互的同时扩大数据框。解释起来有点复杂,所以我展示了原始数据框和最终数据框。基本上,我正在尝试按年份划分个体癌症记录。

理想情况下,这将使用管道完成,因为这是我更熟悉的内容,并且可以调整我的实际数据框,即 10 年水平和 15 个癌症水平,以及我想给出的第二列中的 10 个水平一样对待。

YR<-as.factor( c(2019,2018,2019,2019,2018,2018,2019,2019,2018))
STATE<-as.factor( c("CA","MA","KY","KY","CA","MA","KY","KY","CA"))
COUNTY<-as.factor( c("C1","M1","K1","K2","C1","M2","K1","K2","C1"))
CANCER<-as.factor(c("Cervical","Lung","Prostate","Breast","Cervical","Breast","Prostate","Prostate","Lung"))
rand_fact<-as.factor(c("rf1","rf2","rf3","fr4","fr5","rf2","rf3","fr4","fr5"))
rand_num<-as.numeric(c(4,3,5,7,3,5,3,24,9))
rand_chr<-as.character(c("a","d","r","e","g","y","r","e","k"))
TEST_DR<-data.frame(YR,STATE,COUNTY,CANCER,rand_fact,rand_num,rand_chr)
rm(YR,STATE,COUNTY,CANCER,rand_chr,rand_num,rand_fact)
TEST_DR<-arrange(.data = TEST_DR,YR,COUNTY)

> print(TEST_DR)
  YR STATE COUNTY   CANCER rand_fact rand_num rand_chr
2018    CA     C1 Cervical       fr5        3        g
2018    CA     C1     Lung       fr5        9        k
2018    MA     M1     Lung       rf2        3        d
2018    MA     M2   Breast       rf2        5        y
2019    CA     C1 Cervical       rf1        4        a
2019    KY     K1 Prostate       rf3        5        r
2019    KY     K1 Prostate       rf3        3        r
2019    KY     K2   Breast       fr4        7        e
2019    KY     K2 Prostate       fr4       24        e

因此,查看下表,您会看到 2018 年有一组癌症列,2019 年有一组单独的列。随着我们的进展,有一些行总计。

STATE<-as.factor( c("CA","MA","MA","KY","KY"))
COUNTY<-as.factor( c("C1","M1","M2","K1","K2"))
Cervical_18<-as.numeric(c(1,0,0,0,0))
Lung_18<-as.numeric(c(1,1,0,0,0))
Prostate_18<-as.numeric(c(0,0,0,0,0))
Breast_18<-as.numeric(c(0,0,1,0,0))
Cervical_19<-as.numeric(c(1,0,0,0,0))
Lung_19<-as.numeric(c(0,0,0,0,0))
Prostate_19<-as.numeric(c(0,0,0,2,1))
Breast_19<-as.numeric(c(0,0,0,0,1))
total_C18<-as.numeric(c(2,1,1,0,0))
total_C19<-as.numeric(c(1,0,0,2,2))
total_cancer<-as.numeric(c(3,1,1,2,2))
TEST_DR3 <-data.frame(STATE,COUNTY,Cervical_18,Lung_18,Prostate_18,
                                             Breast_18,total_C18,Cervical_19,Lung_19,Prostate_19,
                                             Breast_19,total_C19,total_cancer)
rm(STATE,COUNTY,Cervical_18,Lung_18,Prostate_18,
              Breast_18,total_C18,Cervical_19,Lung_19,Prostate_19,
              Breast_19,total_C19,total_cancer)

> print(TEST_DR3)
   STATE COUNTY Cervical_18 Lung_18 Prostate_18 Breast_18 total_C18 Cervical_19  Lung_19  Prostate_19  Breast_19  total_C19  total_cancer
1    CA     C1           1       1           0         0         2           1        0            0          0          1             3
2    MA     M1           0       1           0         0         1           0        0            0          0          0             1
3    MA     M2           0       0           0         1         1           0        0            0          0          0             1
4    KY     K1           0       0           0         0         0           0        0            2          0          2             2
5    KY     K2           0       0           0         0         0           0        0            1          1          2             2

标签: rdataframedplyrpipetidyr

解决方案


这是一个tidyverse解决方案

library(tidyr)
library(dplyr)

TEST_DR %>%                                # v-----------set this to "value" so that later we don't need to specify the column name for the argument "values_from"
  count(YR, STATE, COUNTY, CANCER, name = "value") %>%
  rows_insert(
    (.) %>% 
      group_by(STATE, COUNTY, YR) %>% 
      summarise(CANCER = "total", value = sum(value)),
    by = c("STATE", "COUNTY", "YR", "CANCER")
  ) %>% 
  arrange(YR, factor(CANCER, unique(CANCER))) %>% 
  pivot_wider(
    id_cols = c(STATE, COUNTY), 
    names_from = c(CANCER, YR), 
    values_fn = sum, 
    values_fill = 0L
  ) %>% 
  mutate(total_cancer = rowSums(across(starts_with("total"))))

输出

`summarise()` regrouping output by 'STATE', 'COUNTY' (override with `.groups` argument)
# A tibble: 5 x 11
  STATE COUNTY Cervical_2018 Lung_2018 Breast_2018 total_2018 Cervical_2019 Breast_2019 Prostate_2019 total_2019 total_cancer
  <fct> <fct>          <int>     <int>       <int>      <int>         <int>       <int>         <int>      <int>        <dbl>
1 CA    C1                 1         1           0          2             1           0             0          1            3
2 MA    M1                 0         1           0          1             0           0             0          0            1
3 MA    M2                 0         0           1          1             0           0             0          0            1
4 KY    K2                 0         0           0          0             0           1             1          2            2
5 KY    K1                 0         0           0          0             0           0             2          2            2

或使用reshape2::dcast

library(reshape2)
library(dplyr)

TEST_DR %>% 
  count(YR, STATE, COUNTY, CANCER, name = "value") %>% 
  rows_insert(
    (.) %>% 
      group_by(STATE, COUNTY, YR) %>% 
      summarise(CANCER = "total", value = sum(value)),
    by = c("STATE", "COUNTY", "YR", "CANCER")
  ) %>% 
  arrange(YR, factor(CANCER, unique(CANCER))) %>% 
  dcast(STATE + COUNTY ~ CANCER + YR, sum) %>% 
  mutate(total_cancer = rowSums(across(starts_with("total"))))

推荐阅读