首页 > 解决方案 > tidyverse:使用一些信息作为最终变量的文本文件操作

问题描述

我尝试对 *.txt 文本文件做一些稍微复杂的处理,但在使用 tidyverse 包时遇到了问题。我的例子如下:

#Package
library(tidyverse)

# Load the raw log as a character vector, one element per line of the file
myfile <- read_lines(
  file = "https://raw.githubusercontent.com/Leprechault/trash/main/result_imgall_f_test.txt"
)

# Look at a small piece
# [1] "Start processing imgall/sample_59178.jpg"                         
# [2] "imgall/sample_59178.jpg: Predicted in 7337.640000 milli-seconds." 
# [3] "End"                                                              
# [4] "Start processing imgall/sample_34773.jpg"                         
# [5] "imgall/sample_34773.jpg: Predicted in 7376.639000 milli-seconds." 
# [6] "End"                                                              
# [7] "Start processing imgall/sample_24908.jpg"                         
# [8] "imgall/sample_24908.jpg: Predicted in 7412.858000 milli-seconds." 
# [9] "End"                                                              
#[10] "Start processing imgall/sample_18922.jpg"                         
#[11] "imgall/sample_18922.jpg: Predicted in 7424.998000 milli-seconds." 
#[12] "End"                                                              
#[13] "Start processing imgall/sample_31653.jpg"                         
#[14] "imgall/sample_31653.jpg: Predicted in 7311.578000 milli-seconds." 
#[15] "lca: 90.681282%\tleft_x:   18\ttop_y:   33\twidth:   18\theight:   30"
#[16] "End"                                                              
#[17] "Start processing imgall/sample_17341.jpg"                         
#[18] "imgall/sample_17341.jpg: Predicted in 7418.365000 milli-seconds." 
#[19] "End"                                                              
#[20] "Start processing imgall/sample_11440.jpg"                         
#[21] "imgall/sample_11440.jpg: Predicted in 7365.160000 milli-seconds." 
#[22] "lca: 66.567978%\tleft_x:   54\ttop_y:   34\twidth:   18\theight:   23"
#[23] "lca: 33.219677%\tleft_x:   74\ttop_y:   15\twidth:   23\theight:   22"
#[24] "End" 

我希望得到一个只包含 lca 信息的最终数据集,理想的输出如下:

#[1] left top width height obj_id        lca
#[2] 54   34  18    23     sample_11440  66.567978
#[3] 74   15  23    22     sample_11440  33.219677   
#[4]  1   38  19    28     sample_40452  66.658073

尝试类似:


# Column names wanted in the final data set.
names_col <- c("left", "top", "width", "height", "obj_id","lca")

# Attempt to keep only the detection lines and split them into columns.
# NOTE(review): this yields a 0-row tibble for the two reasons noted below.
mydf <- myfile %>%
  # "lca$" is anchored at the END of the string, so only lines that end in
  # "lca" are kept. The detection lines in the sample start with "lca:" and
  # end with the height value, so nothing matches.
  str_subset("lca$") %>%
  # Turn the character vector into a one-column tibble (column `value`).
  enframe(name = NULL) %>%
  # Split on tabs. Each sample detection line has five tab-separated fields
  # (lca, left_x, top_y, width, height) while `names_col` lists six names;
  # `obj_id` is not on these lines, it appears on the "Start processing" line.
  separate(col = value, into = names_col, sep = "[\t]") 
mydf
# A tibble: 0 x 6
# ... with 6 variables: left <dbl>, top <dbl>, width <dbl>, height <dbl>, obj_id <dbl>, lca <dbl>

没有成功!!请问,有什么想法吗?

标签: r dplyr

解决方案


library(tidyverse)
library(vroom)

# Location of the raw detector log (plain text, not a delimited table).
log_url <- "https://raw.githubusercontent.com/Leprechault/trash/main/result_imgall_f_test.txt"

# Read the log line by line into a one-column tibble (`dados`).
# Reading it as a tab-delimited table would treat the first log line as a
# column header: that record would be lost and the column name would depend
# on the exact text of whatever line happens to come first.
myfile <- tibble(dados = read_lines(log_url))

# Build one row per detection line with the id of the image it belongs to.
# Result: a data.frame with columns left, top, width, height, obj_id, lca.
myfile %>%
  # Each "Start processing <dir>/<name>.<ext>" line opens a new image; take
  # <name> as the id. Every other line gets NA here.
  mutate(
    obj_id = str_match(dados, "^Start processing .*/([^/]+)\\.[^./]+$")[, 2]
  ) %>%
  # Detections always follow their "Start processing" line, so carry the id
  # downwards only; filling upwards could attach the id of a later image.
  fill(obj_id, .direction = "down") %>%
  # Keep only the detection lines (the ones carrying bounding-box fields).
  filter(str_detect(dados, "left_x:")) %>%
  # Fields are tab-separated: "lca: 90.68%", "left_x: 18", "top_y: 33", ...
  separate(dados, c("lca", "left", "top", "width", "height"), sep = "\t") %>%
  # Strip the labels, padding and "%" sign, keeping just the numeric value.
  mutate(across(c(lca, left, top, width, height), parse_number)) %>%
  select(left, top, width, height, obj_id, lca) %>%
  as.data.frame()
#  left top width height       obj_id      lca
#1   18  33    18     30 sample_31653 90.68128
#2   54  34    18     23 sample_11440 66.56798
#3   74  15    23     22 sample_11440 33.21968
#4    1  38    19     28 sample_40452 66.65807

推荐阅读