首页 > 解决方案 > 注释句子中的单词成分

问题描述

我正在使用 openNLP 为整段文本中各个句子里的单词做注释。最终我希望单词 ID 与单词在其所在句子中的顺序一致(即每遇到一个新句子,编号重新从 1 开始)。这是我目前的代码:

# Sample text made of two sentences.
string <- "Last morning, I went to the lake and sat. My dog is the cutest."
ex_string <- as.String(string)

# Sentence annotator first, word annotator second; both are built with
# `probs = TRUE`.
annotators <- list(
  Maxent_Sent_Token_Annotator(probs = TRUE),
  Maxent_Word_Token_Annotator(probs = TRUE)
)

# Annotate the text and print the resulting annotation table.
init_s_w <- annotate(ex_string, annotators)
init_s_w
id type start end
1 sentence 1 41
2 sentence 43 63
3 word 1 4
4 word 6 12
5 word 13 13
6 word 15 15
7 word 17 20
8 word 22 23
9 word 25 27
10 word 29 32
11 word 34 36
12 word 38 40
13 word 41 41
14 word 43 44
15 word 46 48
16 word 50 51
17 word 53 55
18 word 57 62
19 word 63 63

这是我想要的:

id type start end
1 sentence 1 41
2 sentence 43 63
1 word 1 4
2 word 6 12
3 word 13 13
4 word 15 15
5 word 17 20
6 word 22 23
7 word 25 27
8 word 29 32
9 word 34 36
10 word 38 40
11 word 41 41
1 word 43 44
2 word 46 48
3 word 50 51
4 word 53 55
5 word 57 62
6 word 63 63

标签: r string annotations opennlp

解决方案


通过操作您的输入表:

library(dplyr)

# NOTE(review): `a` is not defined in this snippet. It is used as a data frame
# with `type`, `start` and `end` columns -- presumably the annotation result
# converted to a data frame; confirm before running.

# End offsets of the sentence rows; they serve as the upper bin edges below.
r <- a[a$type == "sentence", "end"]

# Bin every row by its start offset. With breaks c(1, r), right = TRUE and
# include.lowest = TRUE the bins are [1, end_1], (end_1, end_2], ..., i.e. one
# bin per sentence, so `pos` identifies the sentence a row belongs to.
a$pos <- cut(a$start, breaks = c(1, r), include.lowest = TRUE, right = TRUE)

# Renumber the word rows from 1 within each sentence bin. ungroup() is applied
# so that `d` does not silently stay grouped for any later operation.
b <- a[a$type == "word", ]
d <- b %>%
  group_by(pos) %>%
  mutate(id = row_number()) %>%
  ungroup()

# Sentence rows (ids untouched) followed by the renumbered word rows.
bind_rows(a[a$type == "sentence", ], d)

从头开始构建更全面的数据集:

# Split the string into sentences.
# The lookbehind "(?<=\\.)" matches the empty position right after each period,
# so the period stays attached to its sentence and the space that follows it
# becomes the first character of the next sentence.
string <- "Last morning, I went to the lake and sat. My dog is the cutest."
sentences <- unlist(strsplit(string, "(?<=\\.)", perl = TRUE))
sentences
#> [1] "Last morning, I went to the lake and sat."
#> [2] " My dog is the cutest."

# Split each sentence into words.
# Punctuation and spaces are retained as separate elements: the pattern is a
# pair of lookaheads, matching the empty position in front of a whitespace or
# punctuation character, so nothing is consumed by the split.
# strsplit() is vectorised over its input, so it directly returns a list with
# one character vector of tokens per sentence.
b <- strsplit(sentences, split = "(?=\\s)|(?=[[:punct:]])", perl = TRUE)
b
#> [[1]]
#>  [1] "Last"    " "       "morning" ","       " "       "I"       " "      
#>  [8] "went"    " "       "to"      " "       "the"     " "       "lake"   
#> [15] " "       "and"     " "       "sat"     "."      
#> 
#> [[2]]
#>  [1] " "      "My"     " "      "dog"    " "      "is"     " "      "the"   
#>  [9] " "      "cutest" "."

# Combine the per-sentence token vectors into one long data frame.
# `word` holds every token in order; `sentence` is the index of the element of
# `b` the token came from. The frame is built in a single vectorised call
# rather than by growing it with rbind() inside a `1:length(b)` loop.
df <- data.frame(
  word = unlist(b, use.names = FALSE),
  sentence = rep(seq_along(b), lengths(b))
)

# Get organized and calculate word positions
# `char`: length of each token in characters.
df$char <- nchar(df$word)
# `id`: running token number, restarting at 1 in every sentence.
df$id <- ave(df$char, df$sentence, FUN = seq_along)
# `start` / `end`: character span of the token counted within its own sentence
# (cumulative token lengths per sentence).
df$end <- ave(df$char, df$sentence, FUN = cumsum)
df$start <- df$end + 1 - df$char
# `start_overall` / `end_overall`: the same span counted across all rows.
# A plain cumsum() replaces the former ave() over a dummy constant column.
df$end_overall <- cumsum(df$char)
df$start_overall <- df$end_overall + 1 - df$char

# Full dataset
df <- df[, c("sentence", "id", "word", "char", "start", "end",
             "start_overall", "end_overall")]
df
#>    sentence id    word char start end start_overall end_overall
#> 1         1  1    Last    4     1   4             1           4
#> 2         1  2            1     5   5             5           5
#> 3         1  3 morning    7     6  12             6          12
#> 4         1  4       ,    1    13  13            13          13
#> 5         1  5            1    14  14            14          14
#> 6         1  6       I    1    15  15            15          15
#> 7         1  7            1    16  16            16          16
#> 8         1  8    went    4    17  20            17          20
#> 9         1  9            1    21  21            21          21
#> 10        1 10      to    2    22  23            22          23
#> 11        1 11            1    24  24            24          24
#> 12        1 12     the    3    25  27            25          27
#> 13        1 13            1    28  28            28          28
#> 14        1 14    lake    4    29  32            29          32
#> 15        1 15            1    33  33            33          33
#> 16        1 16     and    3    34  36            34          36
#> 17        1 17            1    37  37            37          37
#> 18        1 18     sat    3    38  40            38          40
#> 19        1 19       .    1    41  41            41          41
#> 20        2  1            1     1   1            42          42
#> 21        2  2      My    2     2   3            43          44
#> 22        2  3            1     4   4            45          45
#> 23        2  4     dog    3     5   7            46          48
#> 24        2  5            1     8   8            49          49
#> 25        2  6      is    2     9  10            50          51
#> 26        2  7            1    11  11            52          52
#> 27        2  8     the    3    12  14            53          55
#> 28        2  9            1    15  15            56          56
#> 29        2 10  cutest    6    16  21            57          62
#> 30        2 11       .    1    22  22            63          63

# Or to match your expected result
# Drop the single-space tokens so only words and punctuation remain.
df2 <- df[!(df$word %in% c(" ")), ]
# `newid`: running number of the remaining tokens, restarting at 1 in every
# sentence. It is kept on `df2` only and is not part of `df3` below.
df2$newid <- ave(df2$char, df2$sentence, FUN = seq_along)
# `start2` / `end2`: cumulative character offsets over the remaining tokens
# only (spaces no longer counted); also not part of `df3`.
df2$end2 <- cumsum(df2$char)
df2$start2 <- df2$end2 + 1 - df2$char
df2$type <- "word"
# Append the two sentence rows as lists. A c(...) vector mixing "sentence"
# with numbers is a character vector, and binding it would turn every numeric
# column of `df3` into character; list rows keep each value's own type.
df3 <- rbind(df2[, c("type", "start", "end", "start_overall", "end_overall")],
             list("sentence", 1, 41, 1, 41),
             list("sentence", 43, 63, 43, 63))
df3
#>         type start end start_overall end_overall
#> 1       word     1   4             1           4
#> 3       word     6  12             6          12
#> 4       word    13  13            13          13
#> 6       word    15  15            15          15
#> 8       word    17  20            17          20
#> 10      word    22  23            22          23
#> 12      word    25  27            25          27
#> 14      word    29  32            29          32
#> 16      word    34  36            34          36
#> 18      word    38  40            38          40
#> 19      word    41  41            41          41
#> 21      word     2   3            43          44
#> 23      word     5   7            46          48
#> 25      word     9  10            50          51
#> 27      word    12  14            53          55
#> 29      word    16  21            57          62
#> 30      word    22  22            63          63
#> 181 sentence     1  41             1          41
#> 191 sentence    43  63            43          63
Created on 2021-10-22 by the reprex package (v2.0.1)

推荐阅读