首页 > 解决方案 > 如何使用 purrr 或 tidyverse 中的其他函数来重构此脚本?

问题描述

这是清理之前数据的前几行(`head`)。它显示了澳大利亚动物的领养情况。

head(df)
# A tibble: 6 x 12
   year animal_type outcome      ACT   NSW    NT   QLD    SA   TAS   VIC    WA Total
  <dbl> <chr>       <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1  1999 Dogs        Reclaimed    610  3140   205  1392  2329   516  7130     1 15323
2  1999 Dogs        Rehomed     1245  7525   526  5489  1105   480  4908   137 21415
3  1999 Dogs        Other         12   745   955   860   380   168  1001     6  4127
4  1999 Dogs        Euthanized   360  9221     9  9214  1701   599  5217    18 26339
5  1999 Cats        Reclaimed    111   201    22   206   157    31   884     0  1612
6  1999 Cats        Rehomed     1442  3913   269  3901  1055   752  3768    62 15162

这是清理数据后的结果。我使用了一系列 `group_by`,并编写了自己的函数来计算找到新家的动物所占的百分比。

   States    Percent_Found
ACT    ACT 12.1495327102804
NSW    NSW 13.7614678899083
NT      NT 10.3338632750397
QLD    QLD 14.4157814871017
SA      SA 13.2307692307692
TAS    TAS 13.8931297709924

我试图找出 tidyverse 中是否有一种方法可以遍历列,而不必重复调用函数。

library(tidyverse)

# Load the TidyTuesday (2020-07-21) animal outcomes data set from GitHub.
df <- readr:: read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-21/animal_outcomes.csv')

# One table per state/territory column: group the rows by that column's
# values, count the rows per outcome, and convert the result to a base
# data.frame. The same pipeline is repeated once for each of the eight
# columns, which is the repetition the question asks how to avoid.
# NOTE(review): the empty first argument in `count(,outcome, outcome)` is
# unusual; its exact effect may depend on the dplyr version -- confirm.
ACT <- df %>% group_by(df$ACT) %>%  count(,outcome, outcome) %>% data.frame()
NSW <- df %>% group_by(df$NSW) %>%  count(,outcome, outcome) %>% data.frame()
NT <- df %>% group_by(df$NT) %>%  count(,outcome, outcome) %>% data.frame()
QLD <- df %>%group_by(df$QLD) %>% count(,outcome, outcome) %>% data.frame()
SA <- df %>% group_by(df$SA) %>% count(,outcome, outcome) %>% data.frame()
TAS <- df %>% group_by(df$TAS) %>% count(,outcome, outcome) %>% data.frame()
VIC <- df %>% group_by(df$VIC) %>% count(,outcome, outcome) %>% data.frame()
WA <- df %>% group_by(df$WA) %>% count(,outcome, outcome) %>% data.frame()

我听说有一种通用的工作流程:先编写简单的函数,再把它映射(map)到数据框上。如何对多列应用这种做法?

# Percentage of "Rehomed" records in a per-state count table.
#
# x: a table with an `outcome` column and a count column `n`
#    (presumably one of the ACT, NSW, ... data frames built earlier in the
#    script -- confirm against the callers).
# Returns `percent`. NOTE(review): `home_found` is extracted with
# single-bracket indexing, so it presumably stays a one-column data frame and
# the returned value is a data frame rather than a bare number -- confirm.
rate <- function(x)
{  
  # Keep the "Rehomed" rows and count them per outcome.
  home_found <- x %>% filter(x$outcome == "Rehomed") %>% count(,outcome)
  # Keep only the second column of that count table.
  home_found <- home_found[2]
  # Total of `n` over all rows whose outcome is not "Rehomed".
  home_not_found <- x %>% filter(x$outcome != "Rehomed") %>% select(n) %>% sum()
  home_not_found <- home_not_found[1]
  # Share of "Rehomed" in the combined total, scaled to a percentage.
  percent <- home_found / (home_found+ home_not_found)
  percent <- percent* 100
  return (percent)
}

# Apply rate() to each of the eight per-state tables, one call per state.
ACT_val  <- rate(ACT)
NSW_val <- rate(NSW)
NT_Val <- rate(NT)
QLD_val <- rate(QLD)
SA_Val <- rate(SA)
TAS_Val <- rate(TAS)
VIC_VAL <- rate(VIC)
WA_Val <- rate(WA)


# Stack each state label on top of its computed value (label in row 1,
# value in row 2).
a <- rbind("ACT",ACT_val) 
b <- rbind("NSW",NSW_val) 
c <- rbind("NT", NT_Val)
d <- rbind("QLD",QLD_val)
e <- rbind("SA", SA_Val)
f <- rbind("TAS", TAS_Val)
g <- rbind("VIC", VIC_VAL)
h <- rbind("WA", WA_Val)

# Put the eight label/value pairs side by side.
# Note: this reassigns `df`, which held the raw data up to this point.
df <- cbind(a,b,c,d,e,f,g,h) %>% data.frame()
# Use the first row (the state labels) as the column names.
colnames(df) <- df[1,]
# Transpose so that each state becomes a row.
df <- df %>% t()
colnames(df) <- c("States", "Percent_Found")
# t() returns a matrix; convert back to a data frame.
df <- df %>% data.frame()

标签: r tidyverse

解决方案


我猜你想要的是类似下面这样的代码。不过我得到的结果是每一列都为 15.06。另外,我使用的是 dplyr_1.0.0。

library(purrr)
library(dplyr)
library(tibble)

# State/territory columns of `df` to iterate over.
rate_cols <- c(
  "ACT", "NSW", "NT", "QLD",
  "SA", "TAS", "VIC", "WA"
)

# Percentage of all counted records in `x` that have the outcome `x_outcome`.
#
# x:         data frame (or tibble) with an `outcome` column and a count
#            column `n`, e.g. the result of `count(outcome)`.
# x_outcome: outcome label whose share is computed; defaults to "Rehomed".
#
# Returns a single number on a 0-100 scale (NaN when `sum(x$n)` is 0).
rate <- function(x, x_outcome = "Rehomed") {
  # Rows whose outcome matches; rows with a missing outcome never match.
  # Plain indexing is used so `x_outcome` always refers to the argument,
  # even if `x` happens to contain a column with that name.
  is_match <- !is.na(x$outcome) & x$outcome == x_outcome

  # Summing the matching counts keeps the result a single value when zero
  # or several rows match, so it is always safe to use with map_dbl().
  sum(x$n[is_match]) / sum(x$n) * 100
}

# Call rate() once per state name and collect the results in a two-column
# tibble (States, Percent_Found), mirroring the layout of the asker's table.
#
# NOTE: `!!state` unquotes a character string, so group_by() receives the
# constant "ACT", "NSW", ... rather than the column of that name. Each
# iteration therefore counts outcomes over all rows of `df`, which is why
# every state ends up with the same percentage. Grouping by the column's
# values would need `.data[[state]]` instead.
rate_cols %>%
  set_names() %>%
  map_dbl(function(state) {
    outcome_counts <- df %>%
      group_by(!!state) %>%
      count(outcome)
    rate(outcome_counts)
  }) %>%
  enframe(name = "States", value = "Percent_Found")

这与您发布的内容相符。

# A tibble: 8 x 2
  States Percent_Found
  <chr>          <dbl>
1 ACT             15.1
2 NSW             15.1
3 NT              15.1
4 QLD             15.1
5 SA              15.1
6 TAS             15.1
7 VIC             15.1
8 WA              15.1

不过,我不认为上面的结果是你真正想要的。下面的做法会得到不同的数字,但也许对你有用。除非我误解了你的数据集,否则我认为你需要的是 `sum`,而不是 `count`。

library(dplyr)
library(tidyr)

# Share of each outcome per state, based on the animal totals (the sum of
# each state column) rather than on row counts; only the "Rehomed" rows are
# kept. `Percent_Total` is a proportion between 0 and 1.
df %>%
  group_by(outcome) %>%
  # across() replaces the superseded summarize_at(), and all_of() is only
  # meant to be used inside a selecting context such as across().
  summarise(across(all_of(rate_cols), ~ sum(.x, na.rm = TRUE))) %>%
  pivot_longer(cols = -outcome, names_to = "States") %>%
  # Grouping by state makes sum(value) the per-state total over all outcomes.
  group_by(States) %>%
  mutate(Percent_Total = value / sum(value)) %>%
  filter(outcome == "Rehomed")

你可以根据需要从这个结果中提取所需的内容。

# A tibble: 8 x 4
# Groups:   States [8]
  outcome States  value Percent_Total
  <chr>   <chr>   <dbl>         <dbl>
1 Rehomed ACT     45678         0.341
2 Rehomed NSW    194820         0.302
3 Rehomed NT      56228         0.409
4 Rehomed QLD    252229         0.294
5 Rehomed SA      62939         0.299
6 Rehomed TAS     35390         0.378
7 Rehomed VIC    201866         0.306
8 Rehomed WA      24781         0.422

推荐阅读