r - 不同数据帧(不同长度)的列之间的模糊匹配
问题描述
我有两个数据框,其中包含来自两个不同来源的国家级数据。数据框包含国家名称和某个变量的数量(我只是弥补了数量以避免数字太长)。请参阅下面的数据框示例。问题是两个来源之间的国家命名不同,在某些情况下,相似的县名可能因两个来源中的拼写不同而不同(例如,df1 中的 CentralAfricanRepublic 是 df2 中的中非共和国)。我想使用 df1 作为参考,以使用某种模糊匹配来识别两个数据框中相同的国家,然后仅从 df2 中选择出现在 df1 中的国家的数据。请注意,df2 有更多国家。
df1 <- data.frame(country=c("Argentina", "Bahamas", "Barbados", "Belize", "Bolivia", "Brazil", "BritishVirginIslands", "Chile", "Colombia", "CostaRica", "Cuba",
"Dominica", "DominicanRepublic", "Ecuador", "ElSalvador", "FrenchGuiana", "Guadeloupe", "Guatemala", "Guyana", "Haiti", "Honduras", "Jamaica",
"Martinique", "Mexico", "Nicaragua", "Panama", "Paraguay", "Peru","StLucia", "StVincent", "Suriname", "TrinidadandTobago", "Uruguay",
"Venezuela", "Bangladesh", "Bhutan", "Brunei", "Cambodia", "India", "Indonesia", "Laos", "Malaysia", "Myanmar", "Nepal", "Pakistan",
"Philippines", "PapuaNewGuinea", "Singapore", "SriLanka", "Thailand", "TimorLeste", "Vietnam", "Angola", "Benin", "Botswana", "BurkinaFaso",
"Burundi", "Cameroon", "CentralAfricanRepublic", "Chad", "Congo","Djibouti", "DemocraticRepublicoftheCongo", "EquatorialGuinea",
"Eritrea", "Ethiopia", "Gabon", "Gambia", "Ghana", "Guinea","GuineaBissau", "IvoryCoast", "Kenya", "Lesotho", "Liberia",
"Madagascar", "Malawi", "Mauritania", "Mali", "Mozambique", "Namibia","Niger", "Nigeria", "Rwanda", "Sudan", "Senegal", "SierraLeone",
"Somalia", "SouthAfrica", "SouthSudan", "Swaziland", "Tanzania", "Togo", "Uganda", "Zambia", "Zimbabwe", "Canada", "UnitedStatesofAmerica",
"Albania", "Andorra", "Austria", "Belgium", "Bosnia", "Bulgaria", "Croatia", "CzechRepublic", "Denmark", "Finland", "France", "Germany",
"Greece", "Hungary", "Iceland", "Ireland", "Italy", "Liechtenstein","Luxembourg", "Macedonia", "Malta", "Montenegro", "Netherlands",
"Norway", "Poland", "Portugal", "Romania", "Serbia", "Slovakia","Slovenia", "Spain", "Sweden", "Switzerland", "UnitedKingdom",
"Afghanistan", "Algeria", "Bahrain", "Cyprus", "Egypt", "Iran","Iraq", "Israel", "Jordan", "Kuwait", "Lebanon", "Libya", "Morocco",
"Oman", "Qatar", "SaudiArabia", "Syria", "Tunisia", "Turkey","UnitedArabEmirates", "WesternSahara", "Yemen", "Armenia", "Azerbaijan",
"Belarus", "Estonia", "Georgia", "Kazakhstan", "Kyrgyzstan", "Latvia", "Lithuania", "Moldova", "Russia", "Tajikistan", "Turkmenistan",
"Ukraine", "Uzbekistan", "Japan", "Mongolia", "NorthKorea", "SouthKorea","China", "Australia", "CookIslands", "Fiji", "FrenchPolynesia",
"Micronesia", "NewCaledonia", "NewZealand", "Niue", "Samoa", "SolomonIslands", "Tonga", "Vanuatu")
Amounts=c(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120,
122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146,
148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172,
174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198,
200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224,
226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250,
252, 254, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276,
278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302,
304, 306, 308, 310, 312, 314, 316, 318, 320, 322, 324, 326, 328,
330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354,
356, 358, 360, 362, 364, 366, 368, 370))
df2 <- data.frame(country=c("Afghanistan", "Albania", "Algeria", "American Samoa", "Andorra",
"Angola", "Anguilla", "Antigua and Barbuda", "Argentina", "Armenia",
"Aruba", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain",
"Bangladesh", "Barbados", "Belarus", "Belgium", "Belgium-Luxembourg",
"Belize", "Benin", "Bermuda", "Bhutan", "Bolivia (Plurinational State of)",
"Bosnia and Herzegovina", "Botswana", "Brazil", "British Virgin Islands",
"Brunei Darussalam", "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde",
"Cambodia", "Cameroon", "Canada", "Cayman Islands", "Central African Republic",
"Chad", "Channel Islands", "Chile", "China", "China, Hong Kong SAR",
"China, Macao SAR", "China, mainland", "China, Taiwan Province of",
"Colombia", "Comoros", "Congo", "Cook Islands", "Costa Rica",
"Côte d'Ivoire", "Croatia", "Cuba", "Cyprus", "Czechia", "Czechoslovakia",
"Democratic People's Republic of Korea", "Democratic Republic of the Congo",
"Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador",
"Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia",
"Eswatini", "Ethiopia", "Ethiopia PDR", "Falkland Islands (Malvinas)",
"Faroe Islands", "Fiji", "Finland", "France", "French Guyana",
"French Polynesia", "Gabon", "Gambia", "Georgia", "Germany",
"Ghana", "Gibraltar", "Greece", "Greenland", "Grenada", "Guadeloupe",
"Guam", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti",
"Holy See", "Honduras", "Hungary", "Iceland", "India", "Indonesia",
"Iran (Islamic Republic of)", "Iraq", "Ireland", "Isle of Man",
"Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan",
"Kenya", "Kiribati", "Kuwait", "Kyrgyzstan", "Lao People's Democratic Republic",
"Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein",
"Lithuania", "Luxembourg", "Madagascar", "Malawi", "Malaysia",
"Maldives", "Mali", "Malta", "Marshall Islands", "Martinique",
"Mauritania", "Mauritius", "Mayotte", "Mexico", "Micronesia (Federated States of)",
"Monaco", "Mongolia", "Montenegro", "Montserrat", "Morocco",
"Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands",
"Netherlands Antilles (former)", "New Caledonia", "New Zealand",
"Nicaragua", "Niger", "Nigeria", "Niue", "Norfolk Island", "North Macedonia",
"Northern Mariana Islands", "Norway", "Oman", "Pacific Islands Trust Territory",
"Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea",
"Paraguay", "Peru", "Philippines", "Pitcairn", "Poland", "Portugal",
"Puerto Rico", "Qatar", "Republic of Korea", "Republic of Moldova",
"Réunion", "Romania", "Russian Federation", "Rwanda", "Saint Helena, Ascension and Tristan da Cunha",
"Saint Kitts and Nevis", "Saint Lucia", "Saint Pierre and Miquelon",
"Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe",
"Saudi Arabia", "Senegal", "Serbia", "Serbia and Montenegro",
"Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia",
"Solomon Islands", "Somalia", "South Africa", "South Sudan",
"Spain", "Sri Lanka", "Sudan", "Sudan (former)", "Suriname",
"Svalbard and Jan Mayen Islands", "Sweden", "Switzerland", "Syrian Arab Republic",
"Tajikistan", "Thailand", "Timor-Leste", "Togo", "Tokelau", "Tonga",
"Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Turks and Caicos Islands",
"Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom of Great Britain and Northern Ireland",
"United Republic of Tanzania", "United States of America", "United States Virgin Islands",
"Uruguay", "USSR", "Uzbekistan", "Vanuatu", "Venezuela (Bolivarian Republic of)",
"Viet Nam", "Wallis and Futuna Islands", "Western Sahara", "Yemen",
"Yugoslav SFR", "Zambia", "Zimbabwe"),
Amounts=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162,
163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188,
189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240,
241, 242, 243)
)
有谁知道实现这一目标的最巧妙方法?
解决方案
这是一个fuzzyjoin
示例...您可能必须尝试各种 stringdist 方法和函数的参数,以获得最适合您需求的结果。
library( fuzzyjoin )
stringdist_left_join( df1, df2, by = "country", method = "lcs", max_dist = 1 )
# country.x Amounts.x country.y Amounts.y
# 1 Argentina 0 Argentina 9
# 2 Bahamas 2 Bahamas 15
# 3 Barbados 4 Barbados 18
# 4 Belize 6 Belize 22
# 5 Bolivia 8 <NA> NA
# 6 Brazil 10 Brazil 29
# 7 BritishVirginIslands 12 British Virgin Islands 30
# 8 Chile 14 Chile 43
# 9 Colombia 16 Colombia 49
# ....
推荐阅读
- swift - “数组索引超出范围:”从数组中获取范围时
- azure - 使用 Azure ActiveDirectory 作为 IdP 注销 Shibboleth 服务提供程序时的 opensaml::BindingException
- algorithm - 将值从一个范围重新计算到另一个范围
- jenkins - 我如何解析具有 json 的文本文件并获取数组元素
- asp.net-core-3.1 - Razor Pages 密码输入框一直显示默认点
- left-join - 对多个表进行左连接时出现雪花“爆炸连接”问题
- python - 计算python中数字的标准差
- javascript - 不知道为什么这个解决方案返回未定义(数组算法)
- javascript - 将“3+5/2”之类的字符串转换为数字
- python - VSCode for Python 中的函数参数建议/自动完成