首页 > 解决方案 > 我有两个数据框,说明了两个对象的不同特征,我想将它们组合起来,使一个数据框对两者进行比较

问题描述

我有以下两个 data.frame (代码见下文)

  Fruit     Apple                                                
1 countries Germany,Austria,Netherlands, USA
2 varieties A,B,C      

  Fruit     Banana                                                             
1 countries Poland,Germany,France,Netherlands, UK
2 varieties N,C,A  

我想将它们合并成如下形式:

   Col1      Col2        Banana Apple
 1 countries Poland           1     0
 2 countries Germany          1     1
 3 countries France           1     0
 4 countries Netherlands      1     1
 5 countries UK               1     0
 6 countries Austria          0     1
 7 countries USA              0     1
 8 varieties A                1     1
 9 varieties B                0     1
10 varieties C                1     1
11 varieties N                1     0

以下是这两个数据框的代码以及预期的结果:

 # Apple features: one row per feature type, items as a comma-separated string.
 c <- structure(
   list(
     Fruit = c("countries", "varieties"),
     Apple = c("Germany,Austria,Netherlands, USA", "A,B,C")
   ),
   row.names = c(NA, -2L),
   class = c("tbl_df", "tbl", "data.frame")
 )


# Banana features: one row per feature type, items as a comma-separated string.
d <- structure(
  list(
    Fruit = c("countries", "varieties"),
    Banana = c("Poland,Germany,France,Netherlands, UK", "N,C,A")
  ),
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)

 # Expected result: one row per (feature type, item) pair, with a 0/1 flag per
 # fruit telling whether that fruit lists the item.
 Result <- structure(
   list(
     Col1 = c(
       "countries", "countries", "countries", "countries", "countries",
       "countries", "countries",
       "varieties", "varieties", "varieties", "varieties"
     ),
     Col2 = c(
       "Poland", "Germany", "France", "Netherlands", "UK",
       "Austria", "USA",
       "A", "B", "C", "N"
     ),
     Banana = c(1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1),
     Apple = c(0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0)
   ),
   row.names = c(NA, -11L),
   class = c("tbl_df", "tbl", "data.frame")
 )

编辑:我已经能够按照我想要的方式合并两个数据框。代码有点草率,但它可以工作。

library(dplyr)
library(tidyr)

# Expand each comma-separated list into one row per item and add a presence
# flag for the fruit. The separator is given explicitly (a comma plus any
# following spaces) so that multi-word items such as "United Kingdom" stay in
# one piece; the items are labels, so no type conversion is requested.
c <- c %>%
  separate_rows(Apple, sep = ",\\s*") %>%
  mutate(P_Apple = 1)

d <- d %>%
  separate_rows(Banana, sep = ",\\s*") %>%
  mutate(P_Banana = 1)

# Join on the feature type as well as on the item itself. Matching on the item
# alone would merge an item that occurs under two different feature types into
# a single row; keying on both also removes the need to reconcile
# Fruit.x / Fruit.y afterwards.
# Resulting columns: Features (feature type), Fruit (item), P_Apple, P_Banana.
Result <- full_join(c, d, by = c("Fruit", "Apple" = "Banana")) %>%
  rename(Features = Fruit, Fruit = Apple) %>%
  # An item missing from one fruit has NA in that fruit's flag: record it as 0.
  # Only the flag columns are filled, the label columns are left untouched.
  replace_na(list(P_Apple = 0, P_Banana = 0))

标签: r, merge, comparison

解决方案


所需的输出见下文。如果还有需要补充的地方,请告诉我。

library(dummies)
library(splitstackshape)
library(tibble)
library(dplyr)  # %>%, select, filter_all, mutate, filter
library(tidyr)  # separate, spread
library(zoo)    # na.fill

#Your Data

# Apple features: one row per feature type, items as a comma-separated string.
c <- structure(
  list(
    Fruit = c("countries", "varieties"),
    Apple = c("Germany,Austria,Netherlands, USA", "A,B,C")
  ),
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)


# Banana features: one row per feature type, items as a comma-separated string.
d <- structure(
  list(
    Fruit = c("countries", "varieties"),
    Banana = c("Poland,Germany,France,Netherlands, UK", "N,C,A")
  ),
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)


# Transform the two data frames and rbind.
# Each input is transposed so that the fruit becomes a single row; the first
# row of the transposed frame holds the column labels, so it is promoted to
# the names and then dropped.
e <- data.frame(t(c), stringsAsFactors = FALSE) %>%
  tibble::rownames_to_column()
names(e) <- as.character(e[1, ])
e <- e[-1, ]

f <- data.frame(t(d), stringsAsFactors = FALSE) %>%
  tibble::rownames_to_column()
names(f) <- as.character(f[1, ])
f <- f[-1, ]

# Split the comma-separated "countries" and "varieties" strings into separate
# columns (wide layout), then transpose again; as above, the first row is
# promoted to the column names and dropped.
g <- data.frame(
  t(cSplit(rbind(e, f), c("countries", "varieties"),
           sep = ",", direction = "wide")),
  stringsAsFactors = FALSE
) %>%
  tibble::rownames_to_column()
names(g) <- as.character(g[1, ])
g <- g[-1, ]

# Country names from both inputs, used below to tell countries from
# varieties. Whitespace is removed on both sides (not only the leading blanks
# of " UK" / " USA") so that a stray trailing space cannot make a country fail
# the %in% lookup and be labelled as a variety.
unique_countries <- trimws(
  unlist(strsplit(rbind(e, f)$countries, ",", fixed = TRUE))
)

# Build the presence matrix: one row per item (Category), one column per
# fruit, plus the feature type the item belongs to.
cmatrix <- g %>%
  select(-Fruit) %>%
  # Keep only rows where at least one column has a value.
  filter_all(any_vars(!is.na(.))) %>%
  dummy.data.frame(., sep = "_") %>%
  t() %>%
  data.frame(., stringsAsFactors = FALSE) %>%
  tibble::rownames_to_column() %>%
  # Row names are split at "_" into the fruit and the item.
  separate(rowname, c("Fruit", "Category"), "_") %>%
  select(Fruit, Category) %>%
  mutate(Flag = 1) %>%
  # One column per fruit; combinations that do not occur become NA, then 0.
  spread(Fruit, Flag) %>%
  na.fill(0) %>%
  data.frame(., stringsAsFactors = FALSE) %>%
  # Drop the row whose Category is the literal string "NA".
  filter(Category != "NA") %>%
  # Anything that is not a known country is labelled as a variety.
  mutate(
    Parent_category = ifelse(
      Category %in% unique_countries, "countries", "varieties"
    )
  )


#And the Output
> cmatrix
      Category Apple Banana Parent_category
1            A     1      1       varieties
2      Austria     1      0       countries
3            B     1      0       varieties
4            C     1      1       varieties
5       France     0      1       countries
6      Germany     1      1       countries
7            N     0      1       varieties
8  Netherlands     1      1       countries
9       Poland     0      1       countries
10          UK     0      1       countries
11         USA     1      0       countries

推荐阅读