首页 > 解决方案 > 将字符串列表中的每个字符串元素编码为 R 数据框列中的数字列表

问题描述

我是 R 新手,希望实现类似这篇文章中的效果:Create dummy variables from string with multiple values for each column in my df。也就是说,我想把每个字符串元素编码为一个数字(虚拟变量),并把每一行得到的数字列表作为该单元格的编码值保存下来。我已经在 Python 中实现过这一操作,但不确定在 R 中的最佳做法。预期输出见下文。目前的数据如下:

| Databases | Cryptocurrency | Visual |
|:---:|:---:|:---:|
| c("PostgreSQL","MySQL","SQL") | c("Ethereum","Cryptocurrency","Bitcoin","Blockchain") | c("BrandDesign","GraphicDesign","LogoDesign","PackageDesign") |
| MSSQLManagement | character(0) | c("BrandDesign", "GraphicDesign", "LogoDesign") |
| c("MongoDB","Redis","MySQL") | c("Bitcoin", "Blockchain") | character(0) |
| c("RedisManagement") | Cryptocurrency | c("BrandDesign", "GraphicDesign") |
| c("MySQL","MemcachedManagement","MongoDB","Redis") | c("Cryptocurrency", "Ethereum", "Blockchain") | GraphicDesign |

预期输出:

| Databases | Cryptocurrency | Visual |
|:---:|:---:|:---:|
| c(1,2,3) | c(1,2,3,4) | c(1,2,3,4) |
| 4 | 0 | c(1, 2, 3) |
| c(5,6,2) | c(3, 4) | 0 |
| c(6) | 2 | c(1, 2) |
| c(2,7,5,6) | c(2, 1, 4) | 2 |

最终目标是在 KNN 模型上训练数据。

编辑:以下是 dput(df[1:25, c(11, 17)]) 的输出:

structure(list(Technical = list(c("Architecture", "TechnicalDesign", 
"Documentation", "RequirementsGathering"), character(0), character(0), 
    c("Documentation", "TechnicalDesign", "RequirementsGathering"
    ), c("Architecture", "TechnicalDesign", "Documentation", 
    "RequirementsGathering"), c("Architecture", "TechnicalDesign", 
    "Documentation", "RequirementsGathering"), c("Architecture", 
    "TechnicalDesign", "Documentation", "RequirementsGathering"
    ), character(0), c("Architecture", "Documentation", "RequirementsGathering"
    ), c("Architecture", "TechnicalDesign", "Documentation", 
    "RequirementsGathering"), c("Architecture", "TechnicalDesign", 
    "Documentation", "RequirementsGathering"), c("Architecture", 
    "TechnicalDesign", "Documentation", "RequirementsGathering"
    ), c("Architecture", "TechnicalDesign", "Documentation", 
    "RequirementsGathering"), c("Architecture", "TechnicalDesign", 
    "Documentation", "RequirementsGathering"), c("Architecture", 
    "TechnicalDesign", "Documentation", "RequirementsGathering"
    ), c("Documentation", "TechnicalDesign", "RequirementsGathering"
    ), character(0), character(0), c("Architecture", "TechnicalDesign", 
    "Documentation", "RequirementsGathering"), c("Architecture", 
    "TechnicalDesign", "Documentation", "RequirementsGathering"
    )), Tools = list(c("Photoshop", "Sketch", "InVision", "Illustrator", 
"Zeplin"), c("GoogleAnalytics", "GoogleAdsense", "MailChimp", 
"GoogleAdwords"), character(0), c("FacebookAds", "GoogleAnalytics", 
"MailChimp"), c("FacebookAds", "GoogleAnalytics", "MailChimp"
), c("Kubernetes", "Vagrant"), c("Gulp", "Vagrant"), character(0), 
    c("GoogleAnalytics", "GoogleAdwords"), c("Hubspot", "Kissmetrics", 
    "InstagramAds", "FacebookAds", "LinkedInAds", "Optimizely", 
    "GoogleAnalytics"), "GoogleAnalytics", c("FacebookAds", "GoogleAnalytics", 
    "GoogleAdsense", "InstagramAds", "GoogleAdwords"), c("GoogleAnalytics", 
    "TwitterAds", "MailChimp"), c("GoogleAnalytics", "MailChimp", 
    "Mixpanel"), c("GoogleAnalytics", "MailChimp", "GoogleAdwords"
    ), c("Photoshop", "Sketch", "InVision", "Illustrator"), character(0), 
    character(0), character(0), c("FacebookAds", "GoogleAnalytics", 
    "Optimizely", "GoogleAdwords", "Mixpanel")), ProjectManagement = list(
    c("AgileMethodology", "ClientManagement", "Scrum"), c("AgileMethodology", 
    "ClientManagement", "Scrum"), "ClientManagement", c("AgileMethodology", 
    "ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement", 
    "Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology", 
    "ClientManagement"), character(0), c("AgileMethodology", 
    "ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement", 
    "Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology", 
    "ClientManagement", "Scrum"), character(0), c("ClientManagement", 
    "Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology", 
    "ClientManagement", "Scrum"), c("AgileMethodology", "Scrum"
    ), "AgileMethodology", c("AgileMethodology", "Scrum"), c("AgileMethodology", 
    "Scrum")), Visual = list(c("BrandDesign", "GraphicDesign", 
"LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign", 
"LogoDesign"), character(0), character(0), c("BrandDesign", "GraphicDesign", 
"LogoDesign", "PackageDesign"), character(0), character(0), character(0), 
    character(0), c("BrandDesign", "GraphicDesign", "LogoDesign"
    ), character(0), character(0), c("BrandDesign", "GraphicDesign", 
    "LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign", 
    "LogoDesign", "PackageDesign"), character(0), character(0), 
    character(0), character(0), character(0), c("BrandDesign", 
    "GraphicDesign", "LogoDesign")), ConfigurationManagement = list(
    character(0), "Chef", character(0), character(0), character(0), 
    c("Ansible", "Terraform", "Puppet"), character(0), character(0), 
    character(0), c("Puppet", "Chef"), c("Terraform", "Chef"), 
    character(0), character(0), character(0), character(0), character(0), 
    character(0), character(0), "Chef", character(0)), Containers = list(
    character(0), "Docker", character(0), character(0), character(0), 
    "Docker", "Docker", "Docker", "Docker", "Docker", "Docker", 
    "Docker", character(0), character(0), "Docker", "Docker", 
    character(0), character(0), "Docker", "Docker"), Cryptocurrency = list(
    character(0), character(0), character(0), character(0), character(0), 
    c("Ethereum", "Cryptocurrency", "Bitcoin", "Blockchain"), 
    character(0), character(0), character(0), c("Ethereum", "Cryptocurrency", 
    "Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency", 
    "Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency", 
    "Bitcoin", "Blockchain"), character(0), character(0), character(0), 
    character(0), character(0), character(0), character(0), character(0))), row.names = c(NA, 
20L), class = "data.frame")

标签: r encoding dummy-variable

解决方案


编写一个重新编码值的函数。

#' Recode tokens as integer ids shared across a whole column
#'
#' Every distinct token is numbered by order of first appearance over all
#' of `x`, so equal strings receive the same id in every row.
#'
#' @param x Either a character vector whose elements hold comma-separated
#'   tokens, or a list of character vectors (a list-column) whose elements
#'   are already split into tokens.
#' @return A list the same length as `x`. Each element is an integer vector
#'   of ids, or `integer(0)` where the row holds no tokens.
recode_values <- function(x) {
  # A list-column is already tokenised; strsplit() only accepts character
  # input, so lists are taken as-is instead of being split.
  if (is.list(x)) {
    tokens <- lapply(x, as.character)
  } else {
    tokens <- strsplit(x, ",\\s*")
  }
  # One vocabulary for the whole column keeps ids consistent between rows.
  vocabulary <- unique(unlist(tokens, use.names = FALSE))
  lapply(tokens, match, table = vocabulary)
}

先清理字符串(去掉方括号和单引号),再将该函数应用于多个列。

library(dplyr)

# Remove "[", "]" and "'" from each selected column, then recode its values.
df %>%
  mutate(
    across(
      c(AutomatedTesting, Cryptocurrency),
      function(column) recode_values(gsub("\\[|\\]|'", "", column))
    )
  )

#   AutomatedTesting Cryptocurrency
#1                                 
#2        1, 2, 3, 4               
#3                 1               
#4                 2               
#5                                 
#6              2, 1     1, 2, 3, 4
#7           1, 2, 5               
#8              2, 1               
#9           2, 3, 1               
#10    1, 2, 6, 3, 5     1, 2, 3, 4
#11          2, 6, 1  5, 1, 2, 3, 4
#12             2, 1  5, 1, 2, 3, 4
#13                                
#14                                
#15    1, 2, 6, 3, 4               
#16       1, 2, 6, 5               
#17          2, 3, 1               
#18                                
#19                2               
#20 4, 5, 6, 3, 1, 2               
#21             2, 4               
#22 4, 5, 6, 3, 1, 2               
#23             2, 1               
#24          2, 6, 1               
#25       2, 6, 3, 1               

推荐阅读