R - 匹配以逗号分隔的短语(不考虑短语出现的位置)
问题描述
我有 2 个数据框,需要将 df_1 与 df_2 进行比较:对 df_1 的 col_1 中的每个字符串,从 df_2 的 col_2 中找出相似的字符串,并将匹配的短语数及其占比存储到 df_out 数据框中。
# First input: every element is a comma-separated list of phrases.
col_1 <- c(
  "inside the world,worldwide web,google chrome app",
  "world health organisation, from country",
  "team work",
  "size of the country, bigger compared to other"
)
df_1 <- data.frame(col_1)

# Second input: comma-separated phrase lists that df_1 is compared against.
col_2 <- c(
  "team work,in the company",
  "size of the country",
  "inside the world,worldwide web,google chrome app",
  "google chrome app,worldwide web,inside the world",
  "inside the world,google chrome app",
  "web worldwide",
  "world health organisation, from country",
  "from country",
  "size of the country, bigger compared to other",
  "country from",
  "world health organisation,country from"
)
df_2 <- data.frame(col_2)
# Desired output, one row per (col_1, col_2) pairing.
# Note: col_1 and col_2 are reassigned here and replace the input vectors above
# (df_1 and df_2 have already been built from the earlier values).
col_1 <- c(
  "inside the world,worldwide web,google chrome app",
  "inside the world,worldwide web,google chrome app",
  "inside the world,worldwide web,google chrome app",
  "inside the world,worldwide web,google chrome app",
  "world health organisation, from country",
  "world health organisation, from country",
  "world health organisation, from country",
  "world health organisation, from country",
  "team work",
  "size of the country, bigger compared to other",
  "size of the country, bigger compared to other",
  "inside the world,worldwide web,google chrome app",
  "world health organisation, from country"
)
col_2 <- c(
  "inside the world,worldwide web,google chrome app",
  "inside the world,google chrome app",
  "google chrome app,worldwide web,inside the world",
  "web worldwide",
  "world health organisation, from country",
  "from country",
  "country from",
  "world health organisation,country from",
  "team work,in the company",
  "size of the country, bigger compared to other",
  "size of the country",
  "team work,in the company",
  "web worldwide"
)

# "matched phrases / phrases in col_1", first as text and then as a percentage.
# NOTE(review): row 7 says "1/1" (100) although its col_1 holds two phrases;
# this looks like a typo for "1/2" (50) -- confirm with the author.
match_percentage <- c(
  "1/1", "2/3", "3/3", "1/3",
  "2/2", "1/2", "1/1", "2/2",
  "1/1", "2/2", "1/2", "0/3", "0/2"
)
match_numeric_percentage <- c(
  100, 66.666, 100, 33.3333,
  100, 50, 100, 100,
  100, 100, 50, 0, 0
)

df_out <- data.frame(
  col_1,
  col_2,
  match_percentage,
  match_numeric_percentage
)
解决方案
尝试这个:
# Row-wise phrase matching between col_1 and col_2 (base R only, no packages).
#
# For every row, count how many of the comma-separated phrases in col_1 also
# occur in col_2 on the same row, and express that as a percentage of the
# phrases in col_1.
#
# Phrases are normalised before they are compared, so that
#   * the position of a phrase inside the comma-separated list is irrelevant,
#   * blanks around a comma are ignored (" from country" == "from country"),
#   * word order inside a phrase is ignored ("web worldwide" == "worldwide web").
# Matching stays case-sensitive.

# Turn one comma-separated string into its set of normalised phrase keys.
# Each phrase is trimmed, split into words, the words are sorted and glued
# back together; empty/NA phrases are dropped and duplicates collapsed.
phrase_keys <- function(x) {
  phrases <- trimws(strsplit(x, ",", fixed = TRUE)[[1L]])
  phrases <- phrases[!is.na(phrases) & nzchar(phrases)]
  keys <- vapply(
    strsplit(phrases, "[[:space:]]+"),
    function(words) paste(sort(words), collapse = " "),
    character(1L)
  )
  unique(keys)
}

df_temp <- data.frame(col_1, col_2, stringsAsFactors = FALSE)

# Normalise each side once; as.character() guards against factor columns.
keys_1 <- lapply(as.character(df_temp$col_1), phrase_keys)
keys_2 <- lapply(as.character(df_temp$col_2), phrase_keys)

# Number of col_1 phrases that are also present in col_2, per row.
n_matched <- vapply(
  seq_along(keys_1),
  function(i) length(intersect(keys_1[[i]], keys_2[[i]])),
  integer(1L)
)
# Number of distinct phrases in col_1, per row (the denominator).
n_phrases <- lengths(keys_1)

df_out <- df_temp
# A row whose col_1 holds no phrase has no meaningful percentage: NA, not NaN.
df_out$perc <- ifelse(n_phrases > 0L, n_matched / n_phrases * 100, NA_real_)
df_out$intersect <- n_matched
df_out$n_col_1 <- n_phrases

# df_out:
#    col_1 col_2 perc intersect n_col_1
# 1  inside the world,worldwide web,google chrome app inside the world,worldwide web,google chrome app 100.00000 3 3
# 2  inside the world,worldwide web,google chrome app inside the world,google chrome app 66.66667 2 3
# 3  inside the world,worldwide web,google chrome app google chrome app,worldwide web,inside the world 100.00000 3 3
# 4  inside the world,worldwide web,google chrome app web worldwide 33.33333 1 3
# 5  world health organisation, from country world health organisation, from country 100.00000 2 2
# 6  world health organisation, from country from country 50.00000 1 2
# 7  world health organisation, from country country from 50.00000 1 2
# 8  world health organisation, from country world health organisation,country from 100.00000 2 2
# 9  team work team work,in the company 100.00000 1 1
# 10 size of the country, bigger compared to other size of the country, bigger compared to other 100.00000 2 2
# 11 size of the country, bigger compared to other size of the country 50.00000 1 2
# 12 inside the world,worldwide web,google chrome app team work,in the company 0.00000 0 3
# 13 world health organisation, from country web worldwide 0.00000 0 2