javascript - 在 JavaScript 中计算文档中单词的开始和结束位置
问题描述
我有一个文本文档,它表示为一个句子数组(array),其中每个句子又包含一个单词标记(token)数组。
我必须为每个标记计算它在整个文档中的绝对开始位置和结束位置。因此,如果 ipsum 在一个句子中出现了五次,那么每一次出现都必须得到各自正确的位置。
我写了这个函数
// Assign begin/end character offsets to every token of every sentence.
// The token objects are updated in place and the same textArray is returned.
function calculateTokenBeginEnd(textArray) {
  // Characters emitted so far; only its length is read, as the running offset.
  const emitted = [];
  for (const sentence of textArray.sentences) {
    const tokens = sentence.tokens;
    let idx = 0;
    while (idx < tokens.length) {
      const current = tokens[idx];
      const chars = current.word;
      // A separating space is emitted only when the token's incoming begin
      // offset is greater than the previous token's end offset. Note that the
      // previous token's end offset has already been rewritten at this point.
      if (idx > 0 && current.characterOffsetBegin > tokens[idx - 1].characterOffsetEnd) {
        emitted.push(' ');
      }
      current.characterOffsetBegin = emitted.length;
      for (let k = 0; k < chars.length; k += 1) {
        emitted.push(chars[k]);
      }
      // The end offset is exclusive: it is the buffer length after the word.
      current.characterOffsetEnd = emitted.length;
      idx += 1;
    }
    // Every sentence is terminated by one newline character.
    emitted.push('\n');
  }
  return textArray;
} //calculateTokenBeginEnd
但是有问题:计算出来的 characterOffsetBegin 和 characterOffsetEnd 是错误的。文档具有以下结构:
{
"sentences": [
{
"index": 0,
"text": "Lorem ipsum dolor sit amet,",
"tokens": [
{
"index": 1,
"word": "Lorem",
"characterOffsetBegin": 0,
"characterOffsetEnd": 5
},
{
"index": 2,
"word": "ipsum",
"characterOffsetBegin": 5,
"characterOffsetEnd": 10
},
...
]
},
{
"index": 1,
"text": " consectetur adipiscing elit,",
"tokens": [
{
"index": 1,
"word": "",
"characterOffsetBegin": 24,
"characterOffsetEnd": 24
},
...
}
下面是使用该方法的示例。text2SentencesTokens 负责创建上面的文档结构,随后 calculateTokenBeginEnd 应该计算每个标记的开始和结束索引。但是 calculateTokenBeginEnd 没有按预期工作。
// Sample input: twelve lines separated by "\n". Declared explicitly so the
// snippet does not create an implicit global (an error in strict mode / ES modules).
var text = "Lorem ipsum dolor sit amet,\n consectetur adipiscing elit,\nsed do eiusmod tempor incididunt\nut labore et dolore magna aliqua.\nUt enim ad minim veniam,\nquis nostrud exercitation ullamco laboris nisi\nut aliquip ex ea commodo consequat.\nDuis aute irure dolor in reprehenderit in voluptate velit esse\ncillum dolore eu fugiat nulla pariatur.\nExcepteur sint occaecat cupidatat non proident,\nLorem ipsum dolor sit amet etwas,\nsunt in culpa qui officia deserunt mollit anim id est laborum";
// to map a text to sentences and tokens
/**
 * Split a text into sentences (one per line) and each sentence into
 * whitespace-separated tokens.
 *
 * Each token becomes `{ index, word }`, where `index` is 1-based within its
 * sentence. A sentence that starts with whitespace yields an empty first word,
 * because that is how `String.prototype.split` treats a leading separator.
 *
 * @param {string} text - Document text; runs of "\n" separate the sentences.
 * @param {function(Object): (Object|Promise<Object>)} [tokenP] - Optional hook
 *   called with each token item; its (possibly asynchronous) result replaces
 *   the item in the output.
 * @returns {Promise<{sentences: Array<{index: number, text: string, tokens: Array}>}>}
 *   Resolves with the document structure. Rejects if `text` is not a string or
 *   if the hook throws or rejects; failures are no longer only logged while
 *   the promise stays pending.
 */
function text2SentencesTokens(text, tokenP) {
  const self = this;

  // Build the token record and pass it through the hook when one was supplied.
  const buildToken = (word, tokenIndex) => {
    const item = {
      "index": (tokenIndex + 1),
      "word": word
    };
    return typeof tokenP === 'function' ? tokenP.apply(self, [item]) : item;
  };

  // Promise.all accepts plain values as well as promises, so no wrapper
  // promise is needed around each token.
  const buildSentence = (sentence, lineIndex) => {
    return Promise.all(sentence.split(/\s+/g).map(buildToken))
      .then(tokens => ({
        index: lineIndex,
        text: sentence,
        tokens: tokens
      }));
  };

  // Starting from a resolved promise turns synchronous failures (bad input,
  // a throwing hook) into a rejection of the returned promise.
  return Promise.resolve()
    .then(() => Promise.all(text.split(/\n+/g).map(buildSentence)))
    .then(sentences => ({
      sentences: sentences
    }));
} //text2SentencesTokens
// Assign begin/end character offsets to every token of every sentence.
// The token objects are updated in place and the same textArray is returned.
function calculateTokenBeginEnd(textArray) {
  // Characters emitted so far; only its length is read, as the running offset.
  const emitted = [];
  for (const sentence of textArray.sentences) {
    const tokens = sentence.tokens;
    let idx = 0;
    while (idx < tokens.length) {
      const current = tokens[idx];
      const chars = current.word;
      // A separating space is emitted only when the token's incoming begin
      // offset is greater than the previous token's end offset. Note that the
      // previous token's end offset has already been rewritten at this point.
      if (idx > 0 && current.characterOffsetBegin > tokens[idx - 1].characterOffsetEnd) {
        emitted.push(' ');
      }
      current.characterOffsetBegin = emitted.length;
      for (let k = 0; k < chars.length; k += 1) {
        emitted.push(chars[k]);
      }
      // The end offset is exclusive: it is the buffer length after the word.
      current.characterOffsetEnd = emitted.length;
      idx += 1;
    }
    // Every sentence is terminated by one newline character.
    emitted.push('\n');
  }
  return textArray;
} //calculateTokenBeginEnd
// Build the sentence/token structure, annotate it with offsets, and print it.
text2SentencesTokens(text)
  .then(doc => {
    const annotated = calculateTokenBeginEnd(doc);
    console.log(annotated);
  })
  // Report a failure from either step instead of leaving an unhandled rejection.
  .catch(err => console.error(err));
[更新]
根据建议,我将函数重写如下:
/**
 * Compute absolute begin/end character offsets for every token, in place.
 *
 * Each token is located literally inside its sentence's `text`, searching
 * from the end of the previous token, so repeated words get the position of
 * their own occurrence rather than the first one. No regular expression is
 * built from the word, so punctuation inside a token (e.g. "amet,") is safe.
 *
 * Offsets are document-absolute. `characterOffsetEnd` is inclusive (index of
 * the token's last character), so an empty word gets end = begin - 1.
 * A token that is not a string, or cannot be found after the previous token,
 * gets -1 for both offsets.
 *
 * NOTE(review): sentences are assumed to be separated by exactly one
 * character in the source text (one is added per sentence); confirm this if
 * the text may contain blank lines.
 *
 * @param {{sentences: Array<{text: string, tokens: Array<{word: string}>}>}} textArray
 * @returns {Object} The same `textArray`, with offsets written onto each token.
 */
function calculateTokenBeginEnd(textArray) {
  // Offset of the current sentence's first character within the document.
  let sentenceStart = 0;
  textArray.sentences.forEach(sentence => {
    const sentenceText = typeof sentence.text === 'string' ? sentence.text : '';
    // Position inside the sentence from which the next token is searched.
    let cursor = 0;
    sentence.tokens.forEach(token => {
      const word = token.word;
      const found = typeof word === 'string' ? sentenceText.indexOf(word, cursor) : -1;
      if (found === -1) {
        token.characterOffsetBegin = -1;
        token.characterOffsetEnd = -1;
        return;
      }
      token.characterOffsetBegin = sentenceStart + found;
      token.characterOffsetEnd = sentenceStart + found + word.length - 1;
      // Continue after this token so an identical later token finds the next occurrence.
      cursor = found + word.length;
    });
    // +1 accounts for the separator that followed this sentence.
    sentenceStart += sentenceText.length + 1;
  });
  return textArray;
}//calculateTokenBeginEnd
有什么更好的解决方案吗?
[更新 2]
我已经根据建议的解决方案更新了 text2SentencesTokens。问题是,当同一个 token 在一个或多个句子中有多处匹配时,此方案无法正常工作,因为它会用最后一次匹配的位置覆盖开始和结束位置。所以这里的 token down 会得到最后一次匹配的位置:
{
"index": 2,
"word": "down",
"characterOffsetBegin": 70,
"characterOffsetEnd": 73
}
这是它在第一个句子中第一次出现时得到的结果,而它本应得到第一个匹配的位置。
// convert a text document into a sentences array and a token array for each sentence
// Each token item carries its 1-based index, its word, and the begin/end
// offsets of a match of that word in the whole text. The optional tokenP hook
// is called with (item, sentence) and its result replaces the item.
function text2SentencesTokens(text, tokenP) {
  const context = this;
  const punctuation = /[\\+;:\?!\»\«\>\<\]\[\)\(,\.\‘'“”"]/g;
  const regexSpecials = /[\-\[\]{}()*+?.,\\\^$|#\s]/g;

  // Build the record for one token of one sentence.
  const describeToken = (token, tokenIndex, sentence) => {
    const item = {
      "index": (tokenIndex + 1),
      "word": token
    };
    const pattern = RegExp("\\b(" + token.replace(regexSpecials, "\\$&") + ")\\b", "g");
    // Every whole-word match in the full text overwrites the offsets, so the
    // values left on the item are those of the last match. The end offset is
    // the index of the word's last character.
    for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
      item.characterOffsetBegin = hit.index;
      item.characterOffsetEnd = hit.index + token.length - 1;
    }
    if (typeof(tokenP) == 'function') {
      return tokenP.apply(context, [item, sentence]);
    }
    return new Promise((done) => {
      done(item);
    });
  };

  // Strip punctuation, split on whitespace, and resolve with the sentence record.
  const describeSentence = (sentence, lineIndex) => new Promise((settle) => {
    const words = sentence.replace(punctuation, '').split(/\s+/g);
    const pending = words.map((word, position) => describeToken(word, position, sentence));
    Promise.all(pending)
      .then(tokens => {
        settle({
          index: lineIndex,
          text: sentence,
          tokens: tokens
        });
      })
      .catch(err => console.error(err))
  });

  return new Promise((settle) => {
    const lines = text.split(/\n+/g);
    const pendingSentences = lines.map((line, lineIndex) => describeSentence(line, lineIndex));
    Promise.all(pendingSentences)
      .then(sentences => {
        settle({
          sentences: sentences
        })
      })
      .catch(err => console.error(err))
  });
} //text2SentencesTokens
// Demo input in which "down" occurs several times across two sentences.
// Declared explicitly so the snippet does not create an implicit global.
var text = "Steve down walks warily down the street down\nWith the brim pulled way down low";
text2SentencesTokens(text)
  .then(res => console.log(JSON.stringify(res, null, 2)))
  // Report failures instead of leaving an unhandled rejection.
  .catch(err => console.error(err));
解决方案
这可能是计算句子中单词开头/结尾的更简单方法,希望它会有所帮助
/**
 * Locate every non-overlapping occurrence of `word` inside `sentence`.
 *
 * The word is matched literally (no regular expression is built from it), so
 * characters such as "." or "(" in `word` need no escaping.
 *
 * @param {string} sentence - Text to search.
 * @param {string} word - Exact text to look for.
 * @returns {Array<{start: number, end: number}>} Positions in left-to-right
 *   order; `end` is the index of the last character (inclusive), so add 1
 *   when passing it to `substring`.
 */
function findWordOccurrences(sentence, word) {
  const occurrences = [];
  // An empty word would match at every index; report nothing instead of looping.
  if (word.length === 0) {
    return occurrences;
  }
  let start = sentence.indexOf(word);
  while (start !== -1) {
    occurrences.push({ start: start, end: start + word.length - 1 });
    // Resume after this occurrence so matches never overlap.
    start = sentence.indexOf(word, start + word.length);
  }
  return occurrences;
}

var word = "Lorem";
var sentence = "Lore ipsum Lorem dolor sit Lorem amet,";
console.log(sentence);
console.log(word);
findWordOccurrences(sentence, word).forEach(function(occurrence) {
  var wordStart = occurrence.start;
  var wordEnd = occurrence.end;
  console.log(wordStart + ' -start index');
  console.log(word.length + ' -length of word');
  console.log(wordEnd + ' -last character index, need to +1 to use with substring');
  console.log(sentence.substring(wordStart, wordEnd + 1) + '-using substring with calculated to find the word and verify');
});
推荐阅读
- php - 如何使用 php 7.3 在 html 页面上运行 php
- python - 如何自定义条形注释以不显示所选值
- python - PyTorch LSTM 正在预测训练数据集
- npm - 包含来自另一个文件的 package.json 依赖项
- javascript - 如何使用 Php 将新对象推送到 Js 中的数组中
- c# - Oracle 阻止 NET6 迁移 - 检测到超出依赖约束的包版本
- git - “git push --force”是否会推送存储库中的所有对象?
- python - 按列/行值匹配和合并python中2个单独数据帧的最佳方法?
- python - 将字符串变量从 Python 文件传递到 Kivy 文件
- node.js - ES6 动态模块导入 Express Routes