r - 注释句子中的单词成分
问题描述
我正在使用 openNLP 为整段文本中每个句子里的单词做标注。最终我希望单词 ID 与它们在各自句子中的顺序一致(每进入一个新句子,编号都从 1 重新开始)。这是我到目前为止所拥有的:
# Example text, wrapped with as.String() before it is handed to annotate()
string <- "Last morning, I went to the lake and sat. My dog is the cutest."
ex_string <- as.String(string)
# Annotate in one pass: the sentence annotator is listed first, the word
# annotator second; both are created with probs = TRUE
init_s_w <- annotate(
  ex_string,
  list(
    Maxent_Sent_Token_Annotator(probs = TRUE),
    Maxent_Word_Token_Annotator(probs = TRUE)
  )
)
init_s_w
ID | 类型 | 开始 | 结尾 |
---|---|---|---|
1 | 句子 | 1 | 41 |
2 | 句子 | 43 | 63 |
3 | 单词 | 1 | 4 |
4 | 单词 | 6 | 12 |
5 | 单词 | 13 | 13 |
6 | 单词 | 15 | 15 |
7 | 单词 | 17 | 20 |
8 | 单词 | 22 | 23 |
9 | 单词 | 25 | 27 |
10 | 单词 | 29 | 32 |
11 | 单词 | 34 | 36 |
12 | 单词 | 38 | 40 |
13 | 单词 | 41 | 41 |
14 | 单词 | 43 | 44 |
15 | 单词 | 46 | 48 |
16 | 单词 | 50 | 51 |
17 | 单词 | 53 | 55 |
18 | 单词 | 57 | 62 |
19 | 单词 | 63 | 63 |
这是我想要的:
ID | 类型 | 开始 | 结尾 |
---|---|---|---|
1 | 句子 | 1 | 41 |
2 | 句子 | 43 | 63 |
1 | 单词 | 1 | 4 |
2 | 单词 | 6 | 12 |
3 | 单词 | 13 | 13 |
4 | 单词 | 15 | 15 |
5 | 单词 | 17 | 20 |
6 | 单词 | 22 | 23 |
7 | 单词 | 25 | 27 |
8 | 单词 | 29 | 32 |
9 | 单词 | 34 | 36 |
10 | 单词 | 38 | 40 |
11 | 单词 | 41 | 41 |
1 | 单词 | 43 | 44 |
2 | 单词 | 46 | 48 |
3 | 单词 | 50 | 51 |
4 | 单词 | 53 | 55 |
5 | 单词 | 57 | 62 |
6 | 单词 | 63 | 63 |
解决方案
通过操作您的输入表(下面假设它保存在数据框 a 中,type 列的取值为 "sentence" / "word",并含有 start、end 列):
library(dplyr)
# Number the words inside each sentence of the annotation table `a`.
# NOTE(review): assumes `a` is a data frame with `type` ("sentence"/"word"),
# `start` and `end` columns, like the table printed above - confirm for
# your own annotation output.
# Sentence end offsets. `$` plus a logical mask returns a plain vector for
# both data.frames and tibbles (`a[rows, "end"]` stays a tibble on tibbles,
# which would break the `c(1, r)` below).
r <- a$end[a$type == "sentence"]
# Bin every row by the sentence span that contains its start offset.
a$pos <- cut(a$start, breaks = c(1, r), include.lowest = TRUE, right = TRUE)
b <- a[a$type == "word", ]
d <- b %>%
  group_by(pos) %>%
  # row_number() restarts at 1 inside every group, i.e. inside every sentence.
  mutate(id = row_number()) %>%
  # Drop the grouping so later verbs applied to `d` are not silently
  # evaluated per sentence.
  ungroup()
# Sentence rows first, then the renumbered word rows.
bind_rows(a[a$type == "sentence", ], d)
从头开始构建更全面的数据集:
# Split the string into sentences
string <- "Last morning, I went to the lake and sat. My dog is the cutest."
# Zero-width lookbehind: the split point sits right after each period, so the
# period stays attached to the sentence it ends. The result is stored as
# `sentences` so that base::split() is not shadowed by a variable.
sentences <- unlist(strsplit(string, "(?<=\\.)", perl = TRUE))
sentences
#> [1] "Last morning, I went to the lake and sat."
#> [2] " My dog is the cutest."
# Split each sentence into words
# Retain punctuation and spaces as elements (using lookahead): the split
# points sit just before every whitespace or punctuation character, so no
# character is consumed by the split.
# strsplit() is vectorized over its input and already returns one character
# vector per sentence, so no lapply()/unlist() round trip is needed.
b <- strsplit(sentences, split = "(?=\\s)|(?=[[:punct:]])", perl = TRUE)
b
#> [[1]]
#> [1] "Last" " " "morning" "," " " "I" " "
#> [8] "went" " " "to" " " "the" " " "lake"
#> [15] " " "and" " " "sat" "."
#>
#> [[2]]
#> [1] " " "My" " " "dog" " " "is" " " "the"
#> [9] " " "cutest" "."
# Combine the per-sentence token vectors in `b` into one long data frame:
# one row per token (`word`), tagged with the index of the sentence it came
# from (`sentence`).
# Built in a single vectorized step instead of growing `df` with rbind()
# inside a `1:length(b)` loop; this also yields a valid zero-row frame when
# `b` is empty and leaves no loop temporaries behind.
df <- data.frame(
  word = as.character(unlist(b)),
  # lengths(b) is the token count per sentence, so each sentence index is
  # repeated once for every token of that sentence.
  sentence = rep(seq_along(b), lengths(b)),
  stringsAsFactors = FALSE
)
# Get organized and calculate word positions
# `char` is the token length in characters.
df$char <- nchar(df$word)
# Running total of token lengths inside each sentence = offset of the token's
# last character within its own sentence.
df$end <- ave(df$char, df$sentence, FUN = cumsum)
# First character of the token; 1 for the first token of a sentence.
df$start <- df$end + 1 - df$char
# Token counter that restarts at 1 in every sentence (separators count too).
df$id <- ave(df$char, df$sentence, FUN = seq_along)
# Whole-text offsets need no grouping, so a plain running total replaces the
# former ave() over a dummy constant column.
df$end_overall <- cumsum(df$char)
df$start_overall <- df$end_overall + 1 - df$char
# Full dataset
df <- df[, c(
  "sentence", "id", "word", "char",
  "start", "end", "start_overall", "end_overall"
)]
df
#> sentence id word char start end start_overall end_overall
#> 1 1 1 Last 4 1 4 1 4
#> 2 1 2 1 5 5 5 5
#> 3 1 3 morning 7 6 12 6 12
#> 4 1 4 , 1 13 13 13 13
#> 5 1 5 1 14 14 14 14
#> 6 1 6 I 1 15 15 15 15
#> 7 1 7 1 16 16 16 16
#> 8 1 8 went 4 17 20 17 20
#> 9 1 9 1 21 21 21 21
#> 10 1 10 to 2 22 23 22 23
#> 11 1 11 1 24 24 24 24
#> 12 1 12 the 3 25 27 25 27
#> 13 1 13 1 28 28 28 28
#> 14 1 14 lake 4 29 32 29 32
#> 15 1 15 1 33 33 33 33
#> 16 1 16 and 3 34 36 34 36
#> 17 1 17 1 37 37 37 37
#> 18 1 18 sat 3 38 40 38 40
#> 19 1 19 . 1 41 41 41 41
#> 20 2 1 1 1 1 42 42
#> 21 2 2 My 2 2 3 43 44
#> 22 2 3 1 4 4 45 45
#> 23 2 4 dog 3 5 7 46 48
#> 24 2 5 1 8 8 49 49
#> 25 2 6 is 2 9 10 50 51
#> 26 2 7 1 11 11 52 52
#> 27 2 8 the 3 12 14 53 55
#> 28 2 9 1 15 15 56 56
#> 29 2 10 cutest 6 16 21 57 62
#> 30 2 11 . 1 22 22 63 63
# Or to match your expected result
# Keep only real tokens: drop the single-space separator rows.
df2 <- df[!(df$word %in% c(" ")), ]
# Word counter that restarts at 1 in every sentence, now without the spaces.
df2$newid <- ave(df2$char, df2$sentence, FUN = seq_along)
df2$constant <- 1
# Running offsets computed over the remaining tokens only (spaces excluded).
df2$end2 <- ave(df2$char, df2$constant, FUN = cumsum)
df2$start2 <- df2$end2 + 1 - df2$char
df2$type <- "word"
# Sentence spans are derived from the first and last remaining token of each
# sentence instead of being hard-coded.
sentence_start <- as.vector(tapply(df2$start_overall, df2$sentence, min))
sentence_end <- as.vector(tapply(df2$end_overall, df2$sentence, max))
# The sentence rows are built as a data frame so the offset columns stay
# numeric; rbind()-ing a character vector such as c("sentence", 1, 41, 1, 41)
# would coerce every column of the result to character.
# Sentence rows carry whole-text offsets in both column pairs, matching the
# sentence rows of the annotation table.
sentence_rows <- data.frame(
  type = "sentence",
  start = sentence_start,
  end = sentence_end,
  start_overall = sentence_start,
  end_overall = sentence_end,
  stringsAsFactors = FALSE
)
df3 <- rbind(
  df2[, c("type", "start", "end", "start_overall", "end_overall")],
  sentence_rows
)
# Row names inherited from `df` are meaningless here; renumber them 1..n.
rownames(df3) <- NULL
df3
#> type start end start_overall end_overall
#> 1 word 1 4 1 4
#> 2 word 6 12 6 12
#> 3 word 13 13 13 13
#> 4 word 15 15 15 15
#> 5 word 17 20 17 20
#> 6 word 22 23 22 23
#> 7 word 25 27 25 27
#> 8 word 29 32 29 32
#> 9 word 34 36 34 36
#> 10 word 38 40 38 40
#> 11 word 41 41 41 41
#> 12 word 2 3 43 44
#> 13 word 5 7 46 48
#> 14 word 9 10 50 51
#> 15 word 12 14 53 55
#> 16 word 16 21 57 62
#> 17 word 22 22 63 63
#> 18 sentence 1 41 1 41
#> 19 sentence 43 63 43 63
Created on 2021-10-22 by the reprex package (v2.0.1)
推荐阅读
- javascript - Lottie Animation 不使用 ejs 加载,而是使用标准 html 加载
- wordpress - 获悉新的漏洞利用
- javascript - 播放视频时 Safari 打开新窗口
- python - Tensorflow 中的 dropout 层会影响哪些层?
- batch-file - 该文件已用批处理文件重命名,但不会停止
- javascript - 我如何在反应原生的函数中设置映射数据的样式
- java - 查找字符串中 LR 的所有组合以遍历数据结构
- html - 边框不占用整个表格宽度
- r - 修改“geom_violin”以绘制直方图而不是密度?
- swift - 在第一个 func 之后执行 func