r - 绘制响应率随时间的演变
问题描述
我试图绘制三个组的响应率如何随时间演变。目标是用一张图显示响应率每天是如何变化的:例如第一天为 5%,第二天为 6%,然后是 8%,依此类推。
我为此拥有的变量是:(1)名为 startday 的时间变量,每个受访者都被赋予其开始调查当天的日期值,而未响应者则被赋予 NA(我认为问题可能出在这里);(2)变量 Non-respondent,未响应的观察值为 1,参与调查的观察值为 0;(3)变量 sf_group,将每个观察值分配到三个组之一。
我已经尝试过了,这对于绘制其他变量随时间的分布非常有用:
# Attempted pipeline: plot a running proportion against start date, with one
# line per sf_group.
df %>%
# Parse the startday strings with lubridate::mdy() into a `date` column.
mutate(date = lubridate::mdy(startday)) %>%
# Order rows by date so the cumulative sums below accumulate in date order.
arrange(date) %>%
# Rs: running count of rows whose `Non-respondent` value is 0 or 1.
# Both cumulative sums are computed before group_by(), so they run over all
# groups together rather than within each sf_group.
mutate(Rs = cumsum(`Non-respondent` %in% c(0, 1)),
# response_Rs: running count of rows where sf_sex equals 0.
# NOTE(review): sf_sex is not among the columns of the sample df given in this
# post; presumably a leftover from plotting another variable -- confirm.
response_Rs = cumsum(sf_sex == 0)) %>%
# Keep only the last row of each (date, sf_group) combination.
group_by(date, sf_group) %>%
slice(n()) %>%
select(date, Rs, response_Rs, sf_group) %>%
# Proportion plotted on the y axis: ratio of the two running counts.
mutate(response_prop = response_Rs/Rs) %>%
ggplot(aes(x = date, y = response_prop, group = sf_group, colour = sf_group)) +
geom_point() +
geom_line()
问题在于,startday 为 NA 的观察值(未响应者)也必须保留:它们与响应者一起构成 100% 的样本,计算响应率时必须把它们计入分母。
有人知道出了什么问题或以另一种方式来计算吗?
这是我的数据框的一部分:非常感谢您的帮助和您的时间。
df <-structure(list(startday = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, "07/02/2019", "05/26/2019", "05/20/2019",
"06/10/2019", "05/24/2019", NA, NA, NA, NA, "05/25/2019", NA,
NA, "05/20/2019", "05/20/2019", NA, NA, NA, "05/20/2019", "06/12/2019",
NA, NA, NA, "05/30/2019", "06/10/2019", NA, "06/04/2019", "06/03/2019",
NA, NA, NA, NA, "06/30/2019", "06/11/2019", NA, NA, NA, NA, "05/23/2019",
NA, NA, NA, "05/23/2019", "05/27/2019", "06/17/2019", "05/21/2019",
"06/02/2019", NA, NA, NA, NA, NA, NA, NA, "05/28/2019", NA, NA,
"05/29/2019", "06/03/2019", NA, NA, NA, NA, "05/20/2019", "06/16/2019",
NA, NA, "07/08/2019", "06/16/2019", "06/24/2019", NA, NA, "05/20/2019",
"05/21/2019", NA, "05/24/2019", "05/20/2019", NA, "06/14/2019",
NA, NA, "05/20/2019", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "06/04/2019", NA, NA, "06/18/2019", NA, NA, NA, NA, NA, "06/15/2019",
NA, NA, "05/28/2019", "06/24/2019", NA, NA, "05/31/2019", NA,
NA, "05/20/2019", "05/20/2019", "06/04/2019", NA, NA, "06/10/2019",
NA, NA, "06/24/2019", NA, NA, NA, NA, NA, "05/21/2019", NA, "06/14/2019",
NA, "05/27/2019", "06/03/2019", NA, "05/27/2019", "05/21/2019",
NA, "06/14/2019", "05/23/2019", "06/04/2019", NA, NA, "06/14/2019",
NA, NA, NA, NA, "06/04/2019", "05/31/2019", "05/20/2019", "06/03/2019",
NA, NA, NA, NA, NA, NA, "06/19/2019", NA, "06/03/2019", NA, "05/21/2019",
"05/24/2019", NA, "06/05/2019", "07/08/2019", NA, NA, "06/16/2019",
"06/03/2019", NA, NA, "06/05/2019", "05/20/2019", "06/02/2019",
NA, NA, NA, NA, "06/10/2019", NA, NA, NA, NA, NA, NA), `Non-respondent` = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1), sf_group = structure(c(3L, 3L,
2L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 1L,
1L, 3L, 1L, 2L, 2L, 2L, 3L, 1L, 2L, 2L, 1L, 1L, 3L, 1L, 1L, 1L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 3L,
1L, 1L, 2L, 1L, 1L, 1L, 3L, 1L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 2L,
1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 3L, 1L, 1L, 1L,
1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 2L,
1L, 1L, 1L, 3L, 1L, 1L), .Label = c("No group", "Groupe 2", "Groupe 1"
), class = "factor")), row.names = c("8011", "1371", "873", "1977",
"528", "18919", "8722", "4633", "1915", "15499", "1357", "1477",
"1163", "1735", "20367", "15951", "17846", "6448", "6744", "86",
"3022", "15932", "1536", "16212", "1032", "13751", "12641", "937",
"570", "25532", "12241", "15002", "21864", "5005", "326", "22342",
"11063", "24744", "13240", "11592", "21291", "525", "18272",
"1947", "10319", "1561", "9809", "5181", "6811", "6465", "440",
"6737", "10780", "13631", "13503", "1884", "17231", "8941", "633",
"8624", "8378", "77", "134", "17033", "12024", "10251", "19967",
"6102", "2007", "13108", "8254", "11576", "382", "18078", "8542",
"1889", "18932", "3093", "8693", "21749", "9271", "1240", "63",
"20334", "2907", "15081", "9867", "17560", "14360", "965", "5082",
"20037", "5622", "24726", "998", "18748", "1269", "1577", "980",
"84", "11064", "14970", "20121", "11046", "1738", "25514", "1876",
"1036", "8711", "7266", "4320", "10040", "9780", "5503", "3247",
"2055", "17981", "2272", "4479", "23322", "6361", "623", "22411",
"9620", "20939", "20002", "6493", "19896", "21975", "20692",
"17702", "23389", "278", "15974", "19556", "603", "744", "15689",
"12927", "1724", "13334", "6966", "986", "12082", "22650", "266",
"16610", "6475", "39", "17807", "19787", "193", "6544", "15671",
"4101", "1658", "2471", "2487", "723", "10298", "15988", "18664",
"6571", "16745", "11901", "4639", "3981", "9126", "24546", "20912",
"3362", "674", "1823", "7887", "767", "13939", "20856", "10735",
"10176", "377", "2037", "4718", "11495", "18804", "4617", "12646",
"19256", "7050", "2566", "13527", "2141", "1607", "301", "422",
"5560", "5868", "1401", "1995", "5782", "9172"), class = "data.frame")
解决方案
我觉得你把事情弄得太复杂了。由于所有 startday 缺失的行,其 Non-respondent 也等于 1,因此最简单的方法是按 Responder 降序(使用 Responder 是为了避免令人困惑的双重否定和尴尬的反引号)和 StartDate 升序对数据集进行排序。那么在任何给定日期,响应者的比例就是当前行号除以数据框中的总行数。
library(lubridate)
library(tidyverse)
# Overall cumulative response proportion by start date.
# Responders are sorted to the top in date order, so a responder's row index
# divided by the total number of rows is the share of the whole sample that
# had responded up to and including that row.
df %>%
  mutate(StartDate = mdy(startday)) %>%
  # A 0 in `Non-respondent` marks a responder; this gives a positive flag.
  mutate(Responder = `Non-respondent` == 0) %>%
  arrange(desc(Responder), StartDate) %>%
  # Row index over total row count (the data is ungrouped at this point).
  mutate(Prop = seq_len(n()) / n()) %>%
  ggplot(aes(StartDate, Prop, colour = sf_group, group = sf_group)) +
  geom_line() +
  geom_point()
packageVersion("tidyverse")
[1] ‘1.3.1’
packageVersion("lubridate")
[1] ‘1.7.10’
修订编辑
为了回应 OP 的澄清,他们希望在 group 内计算比例,而不是整体。这很简单。请注意,分组数据不必按分组变量排序。
下面评论中讨论的误解恰好说明:即使是这样精心准备的问题,也应该至少给出一些关于预期输出的说明。
# Cumulative response proportion computed within each sf_group.
# Rows are sorted (responders first, by start date) before grouping, and
# group_by() keeps that order inside each group, so the within-group row
# index of a responder is the number of responders in that group up to and
# including that row.
df %>%
  mutate(
    StartDate = mdy(startday),
    # Positive flag: TRUE where `Non-respondent` is 0.
    Responder = !`Non-respondent`
  ) %>%
  arrange(desc(Responder), StartDate) %>%
  group_by(sf_group) %>%
  # row_number() is the within-group row index; it replaces the roundabout
  # (and 1:length()-style) `1:length(cur_group_rows())`. n() is the group size.
  mutate(Prop = row_number() / n()) %>%
  # Drop the grouping once the per-group proportion has been computed.
  ungroup() %>%
  ggplot(aes(x = StartDate, y = Prop, colour = sf_group)) +
  geom_line() +
  geom_point()
检查图表是否至少具有可信的属性:
# Sanity check: final response rate per group, to compare with the end point
# of each curve in the plot.
df %>%
  # Positive flag: TRUE where `Non-respondent` is 0. The start date is not
  # needed for this summary, so it is not parsed here.
  mutate(Responder = !`Non-respondent`) %>%
  group_by(sf_group, Responder) %>%
  summarise(N = n(), .groups = "drop") %>%
  # One row per group with counts in columns `FALSE` and `TRUE`. values_fill
  # makes a group that lacks one of the two statuses count as 0 instead of NA,
  # so Pct stays numeric for it.
  pivot_wider(values_from = N, names_from = Responder, values_fill = 0L) %>%
  mutate(Pct = `TRUE` / (`TRUE` + `FALSE`))
# A tibble: 3 x 4
sf_group `FALSE` `TRUE` Pct
<fct> <int> <int> <dbl>
1 No group 86 52 0.377
2 Groupe 2 18 13 0.419
3 Groupe 1 23 8 0.258
推荐阅读
- android - 无法启动模拟器:错误:模拟器未在 60 秒内连接
- python - Softmax 的导数是否有任何适当的 numpy 函数?
- python - 如何修复 numpy python 文件加载/写入中的逻辑错误
- types - 使新的类型定义与所有模块兼容
- c++ - 无法使用 peek() 解析文本文件中的最后一个单词
- php - 我如何在 php 中为字符串写这个并且不给我错误
- r - 如何将 1000 个大小为 25 的随机样本的代码更新为一个大小为 25 的随机样本的 1000 个重采样
- php - 如何在 Laravel 中删除外键约束?
- python - 在服务器中运行的数据库是否可以拒绝连接,并且一旦修复,服务器本身会拒绝它?
- angularjs - 如何将控制器和工厂放在单独的文件中