首页 > 解决方案 > 计算每月在每个地点每次停留的平均天数 - dplyr

问题描述

我希望以有效的方式计算每次住宿、每月和每个地点的平均时间。例如,1 月份在 Loc1 的平均逗留时间为 1.5 天。我有下表

Date.p1 <- c(c(timeBasedSeq("2019-01-01::2019-01-07")), c(timeBasedSeq("2019-01-01::2019-01-07")))
Character.p1 <- c(c(replicate(7, "Paul")), c(replicate(7, "Ellen")))
Location.p1 <-
  c("Loc.1", "Loc.1", "Loc.2", "Loc.2", "Loc.2", "Loc.2","Loc.1", "Loc.2", "Loc.2", "Loc.2", "Loc.3","Loc.3","Loc.2","Loc.2")

df.p1 <- as.data.frame(cbind(as.character(Date.p1), Character.p1, Location.p1))

           V1 Character.p1 Location.p1
1  2019-01-01         Paul       Loc.1
2  2019-01-02         Paul       Loc.1
3  2019-01-03         Paul       Loc.2
4  2019-01-04         Paul       Loc.2
5  2019-01-05         Paul       Loc.2
6  2019-01-06         Paul       Loc.2
7  2019-01-07         Paul       Loc.1
8  2019-01-01        Ellen       Loc.2
9  2019-01-02        Ellen       Loc.2
10 2019-01-03        Ellen       Loc.2
11 2019-01-04        Ellen       Loc.3
12 2019-01-05        Ellen       Loc.3
13 2019-01-06        Ellen       Loc.2
14 2019-01-07        Ellen       Loc.2

我希望以有效的方式计算每次住宿、每月和每个地点的平均时间。例如,1 月份在 Loc1 的平均逗留时间为 1.5 天。

我建立了一个解决方案,但它似乎不是很R-esque。

df.p1 <- unfactor(df.p1)
df.p1$V1 <- as.Date(df.p1$V1)
df.p1$Stay.id <- 0
df.p1$Month <- months(df.p1$V1)
id.num <- 1

创建一个住宿ID来隔离第二个,第三个,第n个住宿

for (i in 1:nrow(df.p1)){
      if( i == 1) {
        df.p1[i,4] <- id.num
        id.num <- id.num + 1
      } else {
        if (df.p1[i,3] == df.p1[i - 1,3]){
          df.p1[i,4] <- df.p1[i - 1,4]
        } else {
          df.p1[i,4] <- id.num
          id.num <- id.num + 1

        }
      }
    }

每次住宿计数夜

df.p2 <- df.p1 %>% dplyr::group_by(Stay.id, Month) %>%
  summarise(Stay.length = n())


Location.Stay.id.Recon <- df.p1 %>% dplyr::select(Location.p1, Stay.id)  %>% distinct()

将 Stay.ID 替换为位置

df.p2 <- merge(df.p2, Location.Stay.id.Recon, by="Stay.id") 

每个地点的平均停留时间

Avg.length <- df.p2 %>% 
  dplyr::group_by(Location.p1, Month) %>%
  dplyr::summarize(Mean = mean(Stay.length, na.rm=TRUE))

这导致:

# A tibble: 3 x 3
# Groups:   Location.p1 [3]
  Location.p1 Month    Mean
  <chr>       <chr>   <dbl>
1 Loc.1       January   1.5
2 Loc.2       January   3  
3 Loc.3       January   2  

标签: r

解决方案


这是使用 dplyr 和 lubridate 的方法:

library(dplyr); library(lubridate)
df.p1 %>%
  mutate_if(is.factor, as.character) %>%
  # Above used b/c lag function below seems to have trouble with factors
  group_by(Character.p1, month = floor_date(ymd(V1), "month")) %>%
  mutate(new_stay = if_else(Location.p1 != lag(Location.p1, default = ""), 1, 0),
         stay_num = cumsum(new_stay)) %>%
  count(Character.p1, Location.p1, month, stay_num) %>%
  group_by(month, Location.p1) %>%
  summarise(Mean = mean(n))

#  month      Location.p1  Mean
#  <date>     <chr>       <dbl>
#1 2019-01-01 Loc.1         1.5
#2 2019-01-01 Loc.2         3  
#3 2019-01-01 Loc.3         2  

推荐阅读