r - 将excel中的信息提取到R中的列表中
问题描述
大家好,我有这个数据集:
> dput(test1)
structure(list(startdate = c("2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-01", "2019-11-05", "2019-11-15",
"2019-11-16", "2019-11-17", "2019-11-18", "2019-11-19", "2019-11-20",
"2019-11-21", NA), id = c("POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL62", "POL63", "POL64", "POL65",
"POL66", "POL67", "POL68", "POL69", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL62", "POL63", "POL64", "POL65",
"POL66", "POL67", "POL68", NA), m0_9 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98,
33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), m10_19 = c(NA,
NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65,
3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), m20_29 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA,
NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA,
NA, NA, NA, NA, NA), m30_39 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), m40_49 = c(32, 34, NA, NA,
NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), m50_59 = c(NA,
NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA,
7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), m60_69 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9,
1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), m70 = c(NA, NA, NA, NA, NA, NA, 32,
34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), f0_9 = c(32, 34, NA,
NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), f10_19 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA, 55,
3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f20_29 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA), f30_39 = c(NA, NA, NA, 32, 34, NA, NA, NA,
NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), f40_49 = c(NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA), f50_59 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f60_69 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), f70 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
我想创建一个名为 ageCat 的列表。该列表应包含多个子列表,子列表的数量等于年龄类别的数量。然后对于每个年龄类别,我想提取以下信息:startAge、endAge、maleCount、femaleCount、totalCount。
另外,我只想总结具有相同 ID 和开始日期的个人。现在我写了这个:
创建年龄列表
# Draft helper meant to build, for one (startdate, id) combination, a list
# with one entry per age category.
# NOTE(review): this is an unfinished draft -- the four assignments inside
# getAgeInfo() have no right-hand sides, so the block does not parse as written.
createLists <- function(startdate, id){
# NOTE(review): the dput() of test1 shows columns `id` and `startdate`, not
# `policyid` and `start`; there is also no comma after the row condition.
# Confirm the intended row filter.
testFiltered = test1[policyid == id & start == startdate]
# NOTE(review): `length == 8` is a comparison, not the `length = 8` argument.
# The 8 is the hard-coded number of age categories.
ageGroup <- vector("list", length == 8)
names(ageGroup) <- as.character(seq_along(ageGroup))
# Fill one slot per age category.
for(ageCat in seq_along(ageGroup)){
ageGroup[[ageCat]] <- getAgeInfo(testFiltered, ageCat)
}
# NOTE(review): getAgeInfo() is only defined here, after the loop above has
# already called it.
getAgeInfo <- function(testFiltered, ageCat){
# Placeholders left blank by the author: start age, end age, and the male and
# female counts for the given age category.
start =
end =
nomales =
nofemales =
}
# Overwrites the ageGroup list filled by the loop with a single four-element
# list. NOTE(review): start, end, nomales and nofemales are only assigned
# inside getAgeInfo(), not in this scope.
ageGroup <- list(startAge = start,
endAge = end ,
maleCount = nomales ,
femaleCount = nofemales)
}
我已经硬编码了向量 ageGroup 的长度。怎样才能不使用硬编码做到这一点,也就是找出每个性别各有多少个年龄类别的列?
其次,我如何提取 startAge、endAge、maleCount、femaleCount、totalCount 这些信息?
解决方案
我建议不要使用列表,而是将您的 data.frame 转换为长格式,消除缺失值并提取性别和年龄。'tidyverse' 方法可能如下所示:
library(dplyr)
library(tidyr)
library(tibble)
df <- tibble(
startdate = c(
"2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06", "2019-11-06",
"2019-11-06", "2019-11-06", "2019-11-06", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27", "2019-11-27",
"2019-11-27", "2019-11-27", "2019-11-01", "2019-11-05", "2019-11-15",
"2019-11-16", "2019-11-17", "2019-11-18", "2019-11-19", "2019-11-20",
"2019-11-21", NA
),
id = c(
"POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL62", "POL63", "POL64", "POL65",
"POL66", "POL67", "POL68", "POL69", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL55", "POL56", "POL57", "POL58",
"POL59", "POL60", "POL61", "POL62", "POL63", "POL64", "POL65",
"POL66", "POL67", "POL68", NA
),
m0_9 = c(
NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98,
33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
),
m10_19 = c(
NA,
NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65,
3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
),
m20_29 = c(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA,
NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA,
NA, NA, NA, NA, NA
),
m30_39 = c(
NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA
),
m40_49 = c(
32, 34, NA, NA,
NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
),
m50_59 = c(
NA,
NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA,
7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), m60_69 = c(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9,
1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA
), m70 = c(
NA, NA, NA, NA, NA, NA, 32,
34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f0_9 = c(
32, 34, NA,
NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f10_19 = c(
NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA, 55,
3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f20_29 = c(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA
), f30_39 = c(
NA, NA, NA, 32, 34, NA, NA, NA,
NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA
), f40_49 = c(
NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA
), f50_59 = c(
NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), f60_69 = c(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, 34, NA, NA, NA, NA,
55, 3, NA, NA, NA, 7, 9, 1, 65, 3, 98, 33, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
), f70 = c(
NA, NA, NA, NA, NA, NA, NA, NA,
NA, 32, 34, NA, NA, NA, NA, 55, 3, NA, NA, NA, 7, 9, 1, 65, 3,
98, 33, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA
)
)
# Convert to a tidy (long) data frame: one row per non-missing count, with the
# sex and the age bounds parsed out of the original column name.
df_age <- df %>%
  # Stack every count column into age_sex (column name) / count (value) pairs
  gather(age_sex, count, -startdate, -id) %>%
  # Drop the empty cells of the wide layout
  filter(!is.na(count)) %>%
  # Split e.g. "m10_19" into sex = "m", start_age = "10", end_age = "19".
  # The open-ended columns ("m70", "f70") have no end age to capture.
  extract(age_sex, into = c("sex", "start_age", "end_age"), regex = "(m|f)(\\d+)_?(\\d+)?", remove = FALSE) %>%
  # Label the age group. Open-ended groups become "70+" rather than a label
  # pasted from a missing end age; the unmatched group is checked for both NA
  # and "" so the label does not depend on how extract() reports it.
  mutate(ageg = if_else(
    is.na(end_age) | end_age == "",
    paste0(start_age, "+"),
    paste0(start_age, "_", end_age)
  ))
df_age
#> # A tibble: 187 x 8
#> startdate id age_sex sex start_age end_age count ageg
#> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
#> 1 2019-11-27 POL55 m0_9 m 0 9 32 0_9
#> 2 2019-11-27 POL56 m0_9 m 0 9 34 0_9
#> 3 2019-11-27 POL61 m0_9 m 0 9 55 0_9
#> 4 2019-11-27 POL55 m0_9 m 0 9 3 0_9
#> 5 2019-11-27 POL59 m0_9 m 0 9 7 0_9
#> 6 2019-11-27 POL60 m0_9 m 0 9 9 0_9
#> 7 2019-11-27 POL61 m0_9 m 0 9 1 0_9
#> 8 2019-11-27 POL55 m0_9 m 0 9 65 0_9
#> 9 2019-11-27 POL56 m0_9 m 0 9 3 0_9
#> 10 2019-11-27 POL57 m0_9 m 0 9 98 0_9
#> # ... with 177 more rows
# Rebuild a nested list from df_age: first level startdate, second level ageg
df_list <- local({
  # Sum `count` per startdate, age group and sex (weighted tally, result in `n`)
  age_sex_totals <- count(df_age, startdate, ageg, start_age, end_age, sex, wt = count)
  # Put the male and female totals back into their own columns, 0 where absent
  age_totals_wide <- spread(age_sex_totals, sex, n, fill = 0)
  # One data frame per startdate ...
  by_startdate <- split(age_totals_wide, age_totals_wide$startdate)
  # ... and each of those split again by age group
  lapply(by_startdate, function(piece) split(piece, piece$ageg))
})
由reprex 包于 2020-03-10 创建(v0.3.0)
推荐阅读
- node.js - Txt ajax 文件显示在网络选项卡中,但不显示文本
- jquery - 在悬停jQuery上仅更改特定的href名称
- python-3.x - Python 在使用一行代码比较来自不同数据帧的两个索引后产生 True False 输出
- react-native - 反应导航不渲染屏幕
- javascript - 重新发布价值成
- list - Flutter:在 null 上调用了“add”方法
- c - C 引用结构成员而不初始化值
- javascript - 使用 patchValue 对模板驱动的表单进行初始化
- jquery - jquery 或 CSS 的函数可以动态地将图像和文本调整为浏览器的宽度和/或高度并保持纵横比?
- javascript - 如何从firestore获取两个查询?