r - 按分解组在另一列中重复匹配值
问题描述
我有一个数据框examp
,其中有一列A
包含日期或单词的字符串。它们要么以 excel 格式开头,要么以格式4
开头YYYY-MM-DD
。使用我的分解id
列,我想创建一个名为的新列Date
,它查看列A
并仅将该列中的日期字符串转换为实际日期。该列应为每个因子级别Date
重复匹配的日期值。id
这是我的数据示例:
structure(list(id = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L), .Label = c("1",
"2", "3", "4", "5"), class = "factor"), A = c("2015-04-30", NA,
NA, NA, "43220", NA, NA, NA, "2014-04-30", NA, NA, NA, "2008-05-30",
NA, NA, NA, "2012-03-31", NA, NA, NA), val1 = c(12.5846281409471,
10.4624600058284, 11.4755754286143, 14.701688181697, 13.8405475026281,
13.004572814877, 7.72203471371193, 13.7606513736878, 12.4136524221556,
10.6791971268818, 12.3554296419778, 11.2009080049977, 13.1858720282748,
12.4583870312518, 14.1838110701387, 12.5430358199843, 12.2190413098143,
14.096471309436, 12.360573242374, 12.6067718045824), val2 = c(34.8949657555743,
31.1540833974875, 31.0088433621199, 43.0588053956557, 33.1324369995825,
37.4651979318779, 39.7380042954774, 43.6837424041797, 39.2399727601316,
30.1360902228072, 30.2985229201897, 32.9737294189259, 29.6223913098321,
38.9078239846969, 45.7976491432947, 28.6114892141713, 25.862618117394,
35.9858613120554, 34.8668513118929, 43.6680483315073)), row.names = c(NA,
-20L), class = "data.frame")
这些是 A 列中的所有值:
c(NA, "42674", "42766", "43131", "43404", "43039", "42704", "43434",
"43069", "42735", "43100", "42794", "43159", "43190", "42825",
"43220", "42855", "43251", "42886", "43281", "42916", "43312",
"42947", "43343", "42978", "43373", "43008", "43585", "43951",
"43708", "43465", "43830", "43524", "43890", "43496", "43861",
"43677", "43646", "43555", "43921", "43616", "43982", "43799",
"43769", "43738", "42643", "2011-07-31", "2012-02-29", "2008-04-30",
"2012-04-30", "2013-04-30", "2015-04-30", "42490", "2009-04-30",
"2010-04-30", "2011-04-30", "2014-04-30", "2012-08-31", "2013-08-31",
"2015-08-31", "42613", "2008-08-31", "2009-08-31", "2010-08-31",
"2011-08-31", "2014-08-31", "2008-12-31", "2012-12-31", "2013-12-31",
"2015-12-31", "2009-12-31", "2010-12-31", "2011-12-31", "2014-12-31",
"2013-02-28", "2015-02-28", "2010-02-28", "2011-02-28", "2014-02-28",
"2016-02-29", "2013-01-31", "2014-01-31", "2016-01-31", "2009-01-31",
"2010-01-31", "2011-01-31", "2015-01-31", "2012-01-31", "2012-07-31",
"2013-09-30", "2015-07-31", "42582", "2008-07-31", "2009-07-31",
"2010-07-31", "2014-07-31", "2012-06-30", "2013-06-30", "2015-06-30",
"42551", "2008-06-30", "2009-06-30", "2010-06-30", "2011-06-30",
"2014-06-30", "2012-03-31", "2013-03-31", "2015-03-31", "42460",
"2009-03-31", "2010-03-31", "2011-03-31", "2012-05-31", "2013-05-31",
"2015-05-31", "42521", "2010-05-31", "2011-05-31", "2014-05-31",
"2008-11-30", "2012-11-30", "2013-11-30", "2015-11-30", "2009-11-30",
"2010-11-30", "2011-11-30", "2014-11-30", "2008-10-31", "2012-10-31",
"2013-10-31", "2015-10-31", "2009-10-31", "2010-10-31", "2014-10-31",
"2012-09-30", "2015-09-30", "2008-09-30", "2010-09-30", "2011-09-30",
"2014-09-30", "2009-02-28", "2008-05-30")
解决方案
根据数据中的模式,我们可以更改日期并用于tidyr::fill
填充NA
基于 的值id
。
library(dplyr)
df %>%
mutate(date = if_else(grepl('^\\d+$', A),
as.Date(as.numeric(A), origin = "1899-12-30"), as.Date(A))) %>%
group_by(id) %>%
tidyr::fill(date)
# id A val1 val2 date
# <fct> <chr> <dbl> <dbl> <date>
# 1 1 2015-04-30 12.6 34.9 2015-04-30
# 2 1 NA 10.5 31.2 2015-04-30
# 3 1 NA 11.5 31.0 2015-04-30
# 4 1 NA 14.7 43.1 2015-04-30
# 5 2 43220 13.8 33.1 2018-04-30
# 6 2 NA 13.0 37.5 2018-04-30
# 7 2 NA 7.72 39.7 2018-04-30
# 8 2 NA 13.8 43.7 2018-04-30
# 9 3 2014-04-30 12.4 39.2 2014-04-30
#10 3 NA 10.7 30.1 2014-04-30
#11 3 NA 12.4 30.3 2014-04-30
#12 3 NA 11.2 33.0 2014-04-30
#13 4 2008-05-30 13.2 29.6 2008-05-30
#14 4 NA 12.5 38.9 2008-05-30
#15 4 NA 14.2 45.8 2008-05-30
#16 4 NA 12.5 28.6 2008-05-30
#17 5 2012-03-31 12.2 25.9 2012-03-31
#18 5 NA 14.1 36.0 2012-03-31
#19 5 NA 12.4 34.9 2012-03-31
#20 5 NA 12.6 43.7 2012-03-31
请注意,这将产生一个警告,因为我们正在使用as.numeric
类似字符串,2015-04-30
但忽略它是安全的。我们也可以使用suppressWarnings()
来压制它们。
我们还可以将其分解为步骤以避免警告和错误:
df$date <- as.Date(NA)
x1 <- grepl('^\\d+$', df$A)
x2 <- grepl('\\d+-\\d+-\\d+', df$A)
df$date[x1] <- as.Date(as.numeric(df$A[x1]), origin = "1899-12-30")
df$date[x2] <- as.Date(df$A[x2])
df %>% group_by(id) %>% fill(date, .direction = 'updown')
推荐阅读
- vb.net - VB.Net - 序列化列表属性
- c - 尝试在 AIX 中动态加载 Oracle 客户端库时出错
- azure - 来自大查询和 Azure 数据工厂管道的数据映射问题
- javascript - 从禁用状态重新启用时,如何让 Chrome 扩展程序触发功能?
- javascript - TypeScript - 使用装饰器向类添加方法
- hadoop - Hadoop:HDFS 空间配额
- servicestack - ServiceStack 试图创建我自己的 OpenIdOAuthProvider 但 VS 2017 说缺少程序集 5.0.0.0
- java - 如何使用java从网站中提取流URL?
- javascript - 无法在使用 testcafe 的反应组件“类型搜索下拉列表”中输入文本
- android - 启动时仅启动前台服务,但不启动主要活动