r - 合并 data.table 返回空变量 - R
问题描述
我有两个data.tables
我正在尝试合并。但是,这些行中的这些行data.tables
需要大量变量以避免重复。由于机密性数据,我们没有标识符变量,我需要结合几个变量来匹配这两个数据集。
我试图加入他们,但是一旦我查看最终数据集,变量就是空的。所有值都设置为 NULL。data1
有 17440 个观测值和 57 个变量。old_data
有 17347 个观测值和 12 个变量。我需要 11 个变量来获得独特的观察结果,我们来命名它们key_variables
。这是我所拥有的:
key_variables <- c("sex", "birthdate", "sint", "cons", "diag", "concelho", "Serologia", "alcohol", "end", "micro")
setkeyv(data1, key_variables)
setkeyv(old_data, key_variables)
dataFinal <- merge(data1, old_data, key_variables, all.x = T)
我要添加的变量data1
是一个因素。我试图更改为字符,但我仍然将变量设置为 NULL。知道什么可能导致这个问题吗?
str(old_data)
Classes ‘data.table’ and 'data.frame': 17347 obs. of 12 variables:
$ sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "llevels")= int 1 2
..- attr(*, "label")= chr "Sex"
$ birthdate : labelled, format: NA NA ...
$ diagnosis_date : labelled, format: "2009-01-09" "2009-10-15" ...
$ county : Factor w/ 300 levels "Lisboa","Sines",..: 23 62 244 34 18 37 1 27 60 66 ...
..- attr(*, "llevels")= int 11 1 2 3 4 5 6 7 8 9 ...
..- attr(*, "label")= chr "County"
str(data)
Classes ‘data.table’ and 'data.frame': 17440 obs. of 57 variables:
$ ID : chr "12083" "12084" "12087" "12096" ...
$ sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "llevels")= int 1 2
$ birthdate : Date, format: NA NA ...
$ county : Factor w/ 300 levels "Lisboa","Sines",..: 17 17 50 235 25 84 28 1 20 47 ...
..- attr(*, "llevels")= int 10 1 2 3 4 5 6 7 8 9 ..
dput(data1)
structure(list(sex = c("Masculino", "Masculino", "Masculino"),
birthdate = c("4/23/1952", "11/26/1964", "01/08/1965"), sint = c("01/01/2014",
"09/01/2010", "01/01/2008"), cons = c("02/10/2014", "12/01/2010",
"1/29/2008"), diag = c("02/10/2014", "12/03/2010", "02/03/2008"
), concelho = c("vila velha de ródão", "vila velha de ródão",
"vila velha de ródão"), Serologia = c("Não", "Não", "Não"
), alcohol = c("Sim", "Não", "Sim"), end = c("11/03/2014",
"10/10/2011", "9/17/2008"), micro = c("03/11/2008", "12/03/2010",
"02/03/2008"), DInflamatoriaArticular = c("Não", "Não", "Não"
)), row.names = c(NA, -3L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001f2af621ef0>)
dput(old_data)
structure(list(sex = c("Masculino", "Masculino", "Masculino"),
birthdate = c("23/04/1952", "26/11/1964", "08/01/1965"),
age = c(61L, 46L, 43L), concelho = c("vila velha de ródão",
"vila velha de ródão", "vila velha de ródão"), EstadoVital = c("Vivo",
"Vivo", "Vivo"), sint = c("01/01/2014", "01/09/2010", "01/01/2008"
), cons = c("10/02/2014", "01/12/2010", "29/01/2008"), alcohol = c("Sim",
"Não", "Sim"), drugs = c("Não", "Não", "Não"), micro = c("11/03/2008",
"03/12/2010", "03/02/2008"), diag = c("10/02/2014", "03/12/2010",
"03/02/2008"), Serologia = c("Não", "Não", "Não"), end = c("03/11/2014",
"10/10/2011", "17/09/2008"), Motivotermotratamento = c("Tratamento Completado",
"Tratamento Completado", "Tratamento Completado"), ano = c(2014L,
2010L, 2008L), region = c("Centro", "Centro", "Centro")), row.names = c(NA,-3L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001f2af621ef0>)
解决方案
正如您在评论中已经提到的,两个表格的时间格式不同。这是对齐它们的可能性:
library(data.table)
key_variables <-
c(
"sex",
"birthdate",
"sint",
"cons",
"diag",
"concelho",
"Serologia",
"alcohol",
"end",
"micro"
)
data1 <-
structure(
list(
sex = c("Masculino", "Masculino", "Masculino"),
birthdate = c("4/23/1952", "11/26/1964", "01/08/1965"),
sint = c("01/01/2014",
"09/01/2010", "01/01/2008"),
cons = c("02/10/2014", "12/01/2010",
"1/29/2008"),
diag = c("02/10/2014", "12/03/2010", "02/03/2008"),
concelho = c("vila velha de ródão", "vila velha de ródão",
"vila velha de ródão"),
Serologia = c("Não", "Não", "Não"),
alcohol = c("Sim", "Não", "Sim"),
end = c("11/03/2014",
"10/10/2011", "9/17/2008"),
micro = c("03/11/2008", "12/03/2010",
"02/03/2008"),
DInflamatoriaArticular = c("Não", "Não", "Não")
),
row.names = c(NA,-3L),
class = c("data.table", "data.frame")
)
old_data <-
structure(
list(
sex = c("Masculino", "Masculino", "Masculino"),
birthdate = c("23/04/1952", "26/11/1964", "08/01/1965"),
age = c(61L, 46L, 43L),
concelho = c("vila velha de ródão",
"vila velha de ródão", "vila velha de ródão"),
EstadoVital = c("Vivo",
"Vivo", "Vivo"),
sint = c("01/01/2014", "01/09/2010", "01/01/2008"),
cons = c("10/02/2014", "01/12/2010", "29/01/2008"),
alcohol = c("Sim",
"Não", "Sim"),
drugs = c("Não", "Não", "Não"),
micro = c("11/03/2008",
"03/12/2010", "03/02/2008"),
diag = c("10/02/2014", "03/12/2010",
"03/02/2008"),
Serologia = c("Não", "Não", "Não"),
end = c("03/11/2014",
"10/10/2011", "17/09/2008"),
Motivotermotratamento = c(
"Tratamento Completado",
"Tratamento Completado",
"Tratamento Completado"
),
ano = c(2014L,
2010L, 2008L),
region = c("Centro", "Centro", "Centro")
),
row.names = c(NA, -3L),
class = c("data.table", "data.frame")
)
setkeyv(data1, key_variables)
setkeyv(old_data, key_variables)
data1[, c("birthdate", "sint", "cons", "diag", "end", "micro") := lapply(.SD, as.Date, format = "%m/%d/%Y"), .SDcols = c("birthdate", "sint", "cons", "diag", "end", "micro")]
old_data[, c("birthdate", "sint", "cons", "diag", "end", "micro") := lapply(.SD, as.Date, format = "%d/%m/%Y"), .SDcols = c("birthdate", "sint", "cons", "diag", "end", "micro")]
dataFinal <- merge(data1, old_data, key_variables)
推荐阅读
- mysql - COUNT 和 MAX 应用于 LEFT JOIN
- android - 错误 - 任务 ':app:processDebugGoogleServices' 执行失败。在 React Native 应用程序上安装 FireBase 时
- batch-file - 如何使用批处理脚本在文本文件中查找字符串
- xml - XML::LibXML : 在 perl 中提取 xml 标记的子和孙
- android - 使用 AutoComplteTextView 时如何避免调用 toString()
- parsing - 在 antlr4 语法中实现标记文本样式运算符时如何首先进行最短匹配?
- maven - rpmbuild - 在 prescriptlet 期间下载文件
- logging - 当文件不可访问时如何滚动日志文件
- angular - 在 Primeng 下拉菜单中添加日历
- javascript - 如何在mongo中编写nodejs脚本输出?