首页 > 解决方案 > 匹配以逗号分隔的短语(不考虑短语的位置)

问题描述

我有 2 个数据框,需要将 df_1 与 df_2 进行比较,从 df_2 的 col_2 中找出相似的字符串,并将它们匹配的短语数存储在 df_out 数据框中。

# Reference data: every entry is a comma-separated list of phrases.
col_1 <- c(
  "inside the world,worldwide web,google chrome app",
  "world health organisation, from country",
  "team work",
  "size of the country, bigger compared to other"
)
df_1 <- data.frame(col_1 = col_1)


# Candidate data: comma-separated phrase lists to be compared with df_1.
col_2 <- c(
  "team work,in the company",
  "size of the country",
  "inside the world,worldwide web,google chrome app",
  "google chrome app,worldwide web,inside the world",
  "inside the world,google chrome app",
  "web worldwide",
  "world health organisation, from country",
  "from country",
  "size of the country, bigger compared to other",
  "country from",
  "world health organisation,country from"
)
df_2 <- data.frame(col_2 = col_2)


# Expected result: one row per (col_1, col_2) pair, with the match count
# written as a fraction string and as a percentage.
col_1 <- c(
  rep("inside the world,worldwide web,google chrome app", 4),
  rep("world health organisation, from country", 4),
  "team work",
  rep("size of the country, bigger compared to other", 2),
  "inside the world,worldwide web,google chrome app",
  "world health organisation, from country"
)

col_2 <- c(
  "inside the world,worldwide web,google chrome app",
  "inside the world,google chrome app",
  "google chrome app,worldwide web,inside the world",
  "web worldwide",
  "world health organisation, from country",
  "from country",
  "country from",
  "world health organisation,country from",
  "team work,in the company",
  "size of the country, bigger compared to other",
  "size of the country",
  "team work,in the company",
  "web worldwide"
)

match_percentage <- c(
  "1/1", "2/3", "3/3", "1/3",
  "2/2", "1/2", "1/1", "2/2",
  "1/1",
  "2/2", "1/2",
  "0/3",
  "0/2"
)

match_numeric_percentage <- c(
  100, 66.666, 100, 33.3333,
  100, 50, 100, 100,
  100,
  100, 50,
  0,
  0
)

df_out <- data.frame(
  col_1 = col_1,
  col_2 = col_2,
  match_percentage = match_percentage,
  match_numeric_percentage = match_numeric_percentage
)

df_out 数据框的解释
在此处输入图像描述

标签: r, dplyr, stringdist

解决方案


尝试这个:

# Count, row by row, how many comma-separated phrases of `col_1` also occur
# in the `col_2` entry on the same row.
#
# Arguments:
#   col_1, col_2  Equal-length vectors (character or factor); each element is
#                 a comma-separated list of phrases.
#
# Returns a data.frame with the columns
#   col_1, col_2  the inputs, unchanged
#   perc          100 * intersect / n_col_1 (NaN when col_1 has no phrases)
#   intersect     number of distinct phrases present in both entries
#   n_col_1       number of phrases in the col_1 entry
#
# Phrases are compared as exact strings after trimming surrounding
# whitespace, so "a, b" and "a,b" are equivalent, but "web worldwide" does
# not match "worldwide web".
phrase_match_stats <- function(col_1, col_2) {
  stopifnot(length(col_1) == length(col_2))

  # as.character() keeps this working when the columns are factors;
  # trimws() removes the blank that follows a ", " separator, which would
  # otherwise make " from country" differ from "from country".
  split_phrases <- function(x) {
    lapply(strsplit(as.character(x), ",", fixed = TRUE), trimws)
  }

  # Split each column once and reuse the result for every derived column.
  phrases_1 <- split_phrases(col_1)
  phrases_2 <- split_phrases(col_2)

  # vapply() always returns an integer vector, even for zero rows.
  n_shared <- vapply(
    seq_along(phrases_1),
    function(i) length(intersect(phrases_1[[i]], phrases_2[[i]])),
    integer(1)
  )
  n_phrases <- lengths(phrases_1)

  data.frame(
    col_1 = col_1,
    col_2 = col_2,
    perc = n_shared / n_phrases * 100,
    intersect = n_shared,
    n_col_1 = n_phrases
  )
}

df_temp <- data.frame(col_1, col_2)
df_out <- phrase_match_stats(df_temp$col_1, df_temp$col_2)

df_out:

                                              col_1                                            col_2      perc intersect n_col_1
1  inside the world,worldwide web,google chrome app inside the world,worldwide web,google chrome app 100.00000         3       3
2  inside the world,worldwide web,google chrome app               inside the world,google chrome app  66.66667         2       3
3  inside the world,worldwide web,google chrome app google chrome app,worldwide web,inside the world 100.00000         3       3
4  inside the world,worldwide web,google chrome app                                    worldwide web  33.33333         1       3
5           world health organisation, from country          world health organisation, from country 100.00000         2       2
6           world health organisation, from country                                     from country   0.00000         0       2
7           world health organisation, from country                                     from country   0.00000         0       2
8           world health organisation, from country           world health organisation,from country  50.00000         1       2
9                                         team work                         team work,in the company 100.00000         1       1
10    size of the country, bigger compared to other    size of the country, bigger compared to other 100.00000         2       2
11    size of the country, bigger compared to other                              size of the country  50.00000         1       2
12 inside the world,worldwide web,google chrome app                         team work,in the company   0.00000         0       3
13          world health organisation, from country                                    worldwide web   0.00000         0       2

推荐阅读