首页 > 解决方案 > 合并 data.table 返回空变量 - R

问题描述

我有两个data.tables我正在尝试合并。但是,这些行中的这些行data.tables需要大量变量以避免重复。由于机密性数据,我们没有标识符变量,我需要结合几个变量来匹配这两个数据集。

我试图加入他们,但是一旦我查看最终数据集,变量就是空的。所有值都设置为 NULL。data1有 17440 个观测值和 57 个变量。old_data有 17347 个观测值和 12 个变量。我需要 11 个变量来获得独特的观察结果,我们来命名它们key_variables。这是我所拥有的:

key_variables <- c("sex", "birthdate", "sint", "cons", "diag", "concelho", "Serologia", "alcohol", "end", "micro")

setkeyv(data1, key_variables)
setkeyv(old_data, key_variables)

dataFinal <- merge(data1, old_data, key_variables, all.x = T)

我要添加的变量data1是一个因素。我试图更改为字符,但我仍然将变量设置为 NULL。知道什么可能导致这个问题吗?

str(old_data)
Classes ‘data.table’ and 'data.frame':  17347 obs. of  12 variables:
 $ sex            : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
  ..- attr(*, "llevels")= int  1 2
  ..- attr(*, "label")= chr "Sex"
 $ birthdate      : labelled, format: NA NA ...
 $ diagnosis_date : labelled, format: "2009-01-09" "2009-10-15" ...
 $ county         : Factor w/ 300 levels "Lisboa","Sines",..: 23 62 244 34 18 37 1 27 60 66 ...
  ..- attr(*, "llevels")= int  11 1 2 3 4 5 6 7 8 9 ...
  ..- attr(*, "label")= chr "County"

str(data)
Classes ‘data.table’ and 'data.frame':  17440 obs. of  57 variables:
  $ ID               : chr  "12083" "12084" "12087" "12096" ...
  $ sex              : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
   ..- attr(*, "llevels")= int  1 2
  $ birthdate        : Date, format: NA NA ...
  $ county           : Factor w/ 300 levels "Lisboa","Sines",..: 17 17 50 235 25 84 28 1 20 47 ...
   ..- attr(*, "llevels")= int  10 1 2 3 4 5 6 7 8 9 ..


dput(data1)
structure(list(sex = c("Masculino", "Masculino", "Masculino"), 
birthdate = c("4/23/1952", "11/26/1964", "01/08/1965"), sint = c("01/01/2014", 
"09/01/2010", "01/01/2008"), cons = c("02/10/2014", "12/01/2010", 
"1/29/2008"), diag = c("02/10/2014", "12/03/2010", "02/03/2008"
), concelho = c("vila velha de ródão", "vila velha de ródão", 
"vila velha de ródão"), Serologia = c("Não", "Não", "Não"
), alcohol = c("Sim", "Não", "Sim"), end = c("11/03/2014", 
"10/10/2011", "9/17/2008"), micro = c("03/11/2008", "12/03/2010", 
"02/03/2008"), DInflamatoriaArticular = c("Não", "Não", "Não"
)), row.names = c(NA, -3L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001f2af621ef0>)

dput(old_data)
structure(list(sex = c("Masculino", "Masculino", "Masculino"), 
birthdate = c("23/04/1952", "26/11/1964", "08/01/1965"), 
age = c(61L, 46L, 43L), concelho = c("vila velha de ródão", 
"vila velha de ródão", "vila velha de ródão"), EstadoVital = c("Vivo", 
"Vivo", "Vivo"), sint = c("01/01/2014", "01/09/2010", "01/01/2008"
), cons = c("10/02/2014", "01/12/2010", "29/01/2008"), alcohol = c("Sim", 
"Não", "Sim"), drugs = c("Não", "Não", "Não"), micro = c("11/03/2008", 
"03/12/2010", "03/02/2008"), diag = c("10/02/2014", "03/12/2010", 
"03/02/2008"), Serologia = c("Não", "Não", "Não"), end = c("03/11/2014", 
"10/10/2011", "17/09/2008"), Motivotermotratamento = c("Tratamento Completado", 
"Tratamento Completado", "Tratamento Completado"), ano = c(2014L, 
2010L, 2008L), region = c("Centro", "Centro", "Centro")), row.names = c(NA,-3L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001f2af621ef0>)

标签: rmergedata.table

解决方案


正如您在评论中已经提到的,两个表格的时间格式不同。这是对齐它们的可能性:

library(data.table)

key_variables <-
  c(
    "sex",
    "birthdate",
    "sint",
    "cons",
    "diag",
    "concelho",
    "Serologia",
    "alcohol",
    "end",
    "micro"
  )

data1 <-
  structure(
    list(
      sex = c("Masculino", "Masculino", "Masculino"),
      birthdate = c("4/23/1952", "11/26/1964", "01/08/1965"),
      sint = c("01/01/2014",
               "09/01/2010", "01/01/2008"),
      cons = c("02/10/2014", "12/01/2010",
               "1/29/2008"),
      diag = c("02/10/2014", "12/03/2010", "02/03/2008"),
      concelho = c("vila velha de ródão", "vila velha de ródão",
                   "vila velha de ródão"),
      Serologia = c("Não", "Não", "Não"),
      alcohol = c("Sim", "Não", "Sim"),
      end = c("11/03/2014",
              "10/10/2011", "9/17/2008"),
      micro = c("03/11/2008", "12/03/2010",
                "02/03/2008"),
      DInflamatoriaArticular = c("Não", "Não", "Não")
    ),
    row.names = c(NA,-3L),
    class = c("data.table", "data.frame")
  )

old_data <-
  structure(
    list(
      sex = c("Masculino", "Masculino", "Masculino"),
      birthdate = c("23/04/1952", "26/11/1964", "08/01/1965"),
      age = c(61L, 46L, 43L),
      concelho = c("vila velha de ródão",
                   "vila velha de ródão", "vila velha de ródão"),
      EstadoVital = c("Vivo",
                      "Vivo", "Vivo"),
      sint = c("01/01/2014", "01/09/2010", "01/01/2008"),
      cons = c("10/02/2014", "01/12/2010", "29/01/2008"),
      alcohol = c("Sim",
                  "Não", "Sim"),
      drugs = c("Não", "Não", "Não"),
      micro = c("11/03/2008",
                "03/12/2010", "03/02/2008"),
      diag = c("10/02/2014", "03/12/2010",
               "03/02/2008"),
      Serologia = c("Não", "Não", "Não"),
      end = c("03/11/2014",
              "10/10/2011", "17/09/2008"),
      Motivotermotratamento = c(
        "Tratamento Completado",
        "Tratamento Completado",
        "Tratamento Completado"
      ),
      ano = c(2014L,
              2010L, 2008L),
      region = c("Centro", "Centro", "Centro")
    ),
    row.names = c(NA, -3L),
    class = c("data.table", "data.frame")
  )

setkeyv(data1, key_variables)
setkeyv(old_data, key_variables)

data1[, c("birthdate", "sint", "cons", "diag", "end", "micro") := lapply(.SD, as.Date, format = "%m/%d/%Y"), .SDcols = c("birthdate", "sint", "cons", "diag", "end", "micro")]
old_data[, c("birthdate", "sint", "cons", "diag", "end", "micro") := lapply(.SD, as.Date, format = "%d/%m/%Y"), .SDcols = c("birthdate", "sint", "cons", "diag", "end", "micro")]

dataFinal <- merge(data1, old_data, key_variables)

推荐阅读