首页 > 解决方案 > 使用 pivot_longer() 的困难

问题描述

我正在尝试使用 pivot_longer() 做一些事情以使宽表变长,但我不太明白。

这是我要操作的数据框的头部

head(stack)
                unique.pair Area.IN Area.NEAR ALLEVEN.IN ALLEVEN.NEAR TREERICH.IN TREERICH.NEAR HEMIAB.IN HEMIAB.NEAR
1             AGFO 1_AGFO 5     100       100  0.7309552    0.3724176           2             1      1.00           0
2           AGFO 27_AGFO 24     100       100  0.8990520    0.6306221           1             0      1.00           0
3            AGFO 6_AGFO 23     100       100  0.7956735    0.7022392           1             1      1.00           0
4 ALFL LAMR.7_ALFL LAMR.103     100       400  0.4425270    0.6838157           4             6      0.50           0
5            APCO 10_APCO 2     400       400  0.5730378    0.5453876          18            19      0.55           0
6             APCO 4_APCO 9     400       400  0.6349441    0.7078960          22            23      0.55           0

基本上,每一行都是一对唯一的 2 个 ID 及其对某些指标(.IN 和.NEAR)的相应度量;我现在需要做到这一点,所以每个唯一对有两行,并且我将它们的指标分开......例如,我在为“ALLEVEN.IN 和 ALLEVEN.NEAR”做这件事上是成功的。我还需要 AREA 指标

master.long <- master.JH %>%
  select(unique.pair, ALLEVEN.IN, ALLEVEN.NEAR, HEMIAB.IN, HEMIAB.NEAR, Area.IN, Area.NEAR) %>%
  pivot_longer(cols = c(ALLEVEN.IN, ALLEVEN.NEAR), names_to = "HEMI", values_to = "ALLEVEN") %>%
  pivot_longer(cols = c(Area.IN, Area.NEAR), names_to = "Area", values_to = "Area_sampled") %>% 
  separate(HEMI, into = c(NA, "HEMI"))%>%
  separate(Area, into = c(NA , "AREA")) %>%
  mutate(HEMI.status = case_when(HEMI == "IN" & AREA == "IN" ~ "HEMI",
                                 HEMI == "NEAR" & AREA =="NEAR" ~ "NO.HEMI"))

输出是:

# A tibble: 6 x 8
  unique.pair     HEMIAB.IN HEMIAB.NEAR HEMI  ALLEVEN AREA  Area_sampled HEMI.status
  <chr>               <dbl>       <dbl> <chr>   <dbl> <chr>        <dbl> <chr>      
1 AGFO 6_AGFO 23          1           0 IN      0.796 IN             100 HEMI       
2 AGFO 6_AGFO 23          1           0 IN      0.796 NEAR           100 NA         
3 AGFO 6_AGFO 23          1           0 NEAR    0.702 IN             100 NA         
4 AGFO 6_AGFO 23          1           0 NEAR    0.702 NEAR           100 NO.HEMI    
5 AGFO 27_AGFO 24         1           0 IN      0.899 IN             100 HEMI       
6 AGFO 27_AGFO 24         1           0 IN      0.899 NEAR           100 NA   

2个问题

1.) 我明白为什么 HEMI.status 有 NA,但我不确定如何告诉代码只删除这些值。我以后可以轻松地做到这一点,但想知道是否有办法在支点更长的时间内完成

2.) 有没有办法对所有列执行此操作,所有列的一个枢轴代码更长;即,我可以将“TREERICH.IN”和“TREERICH.NEAR”合并到其中,使用相同的 HEMI 列吗?我试过了,但是当我对 TREERICH 说 "names_to" = "HEMI" 时(见下文),我得到一个明显的错误

master.long <- master.JH %>%
  select(unique.pair, ALLEVEN.IN, ALLEVEN.NEAR, HEMIAB.IN, HEMIAB.NEAR, Area.IN, Area.NEAR) %>%
  pivot_longer(cols = c(ALLEVEN.IN, ALLEVEN.NEAR), names_to = "HEMI", values_to = "ALLEVEN") %>%
pivot_longer(cols = c(TREERICH.IN, TREERICH.NEAR), names_to = "HEMI", values_to = "TREERICH")
  pivot_longer(cols = c(Area.IN, Area.NEAR), names_to = "Area", values_to = "Area_sampled") %>% 
  separate(HEMI, into = c(NA, "HEMI"))%>%
  separate(Area, into = c(NA , "AREA")) %>%
  mutate(HEMI.status = case_when(HEMI == "IN" & AREA == "IN" ~ "HEMI",
                                 HEMI == "NEAR" & AREA =="NEAR" ~ "NO.HEMI"))

希望我解释得足够好。谢谢你的帮助!

标签: rdplyr

解决方案


如果我们想从原始数据构造一个长格式,请指定names_sepas .(在 regex 模式下默认转义,.表示任何字符)并names_to使用两个组件来.value表示各个列的形式,“in_near”获取后缀部分列名,即“IN”、“NEAR”。如果有NA元素指定values_drop_na = TRUE删除那些NAs

library(dplyr)
library(tidyr)
master.JH %>% 
     pivot_longer(cols = -unique.pair, 
        names_to = c(".value", "in_near"), 
           names_sep = "\\.", values_drop_na = TRUE)

-输出

# A tibble: 12 x 6
#   unique.pair               in_near  Area ALLEVEN TREERICH HEMIAB
#   <chr>                     <chr>   <int>   <dbl>    <int>  <dbl>
# 1 AGFO 1_AGFO 5             IN        100   0.731        2   1   
# 2 AGFO 1_AGFO 5             NEAR      100   0.372        1   0   
# 3 AGFO 27_AGFO 24           IN        100   0.899        1   1   
# 4 AGFO 27_AGFO 24           NEAR      100   0.631        0   0   
# 5 AGFO 6_AGFO 23            IN        100   0.796        1   1   
# 6 AGFO 6_AGFO 23            NEAR      100   0.702        1   0   
# 7 ALFL LAMR.7_ALFL LAMR.103 IN        100   0.443        4   0.5 
# 8 ALFL LAMR.7_ALFL LAMR.103 NEAR      400   0.684        6   0   
# 9 APCO 10_APCO 2            IN        400   0.573       18   0.55
#10 APCO 10_APCO 2            NEAR      400   0.545       19   0   
#11 APCO 4_APCO 9             IN        400   0.635       22   0.55
#12 APCO 4_APCO 9             NEAR      400   0.708       23   0        

数据

master.JH <- structure(list(unique.pair = c("AGFO 1_AGFO 5", "AGFO 27_AGFO 24", 
"AGFO 6_AGFO 23", "ALFL LAMR.7_ALFL LAMR.103", "APCO 10_APCO 2", 
"APCO 4_APCO 9"), Area.IN = c(100L, 100L, 100L, 100L, 400L, 400L
), Area.NEAR = c(100L, 100L, 100L, 400L, 400L, 400L), ALLEVEN.IN = c(0.7309552, 
0.899052, 0.7956735, 0.442527, 0.5730378, 0.6349441), ALLEVEN.NEAR = c(0.3724176, 
0.6306221, 0.7022392, 0.6838157, 0.5453876, 0.707896), TREERICH.IN = c(2L, 
1L, 1L, 4L, 18L, 22L), TREERICH.NEAR = c(1L, 0L, 1L, 6L, 19L, 
23L), HEMIAB.IN = c(1, 1, 1, 0.5, 0.55, 0.55), HEMIAB.NEAR = c(0L, 
0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6"))

推荐阅读