Detect exact words positions in text in JavaScript

问题

I have a text in which some words may repeat. For each word occurrence, I need to record its position in the text, like this:

{
        "index": 10,
        "word": "soul",
        "characterOffsetBegin": 1606,
        "characterOffsetEnd": 1609
}

I have implemented the following approach, which partially works:

// Maps each token to the end offset of its most recently located occurrence,
// so that a repeated token resolves to its next occurrence, not the first one.
var seen = new Map();
tokens.forEach(token => { // for each token
    // Offsets are only added to the record when a match is found in `text`.
    const item = {
        "word": token
    };
    // Escape regex metacharacters so the token is matched literally, and
    // interpolate it instead of using String#replace so that "$" sequences in
    // the token are not interpreted as replacement patterns.
    const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const wordRegex = new RegExp(`\\b(${escaped})\\b`, "g");

    // Explicit has-check instead of `|| -1`: a stored end offset of 0 is a
    // real position and must not be treated as "not seen yet".
    const lastEnd = seen.has(token) ? seen.get(token) : -1;

    // calculate token begin end
    let match = null;
    while ((match = wordRegex.exec(text)) !== null) {
        // Skip occurrences at or before the one already assigned to this token.
        if (match.index > lastEnd) {
            const wordStart = match.index;
            const wordEnd = wordStart + token.length - 1; // inclusive end offset
            item.characterOffsetBegin = wordStart;
            item.characterOffsetEnd = wordEnd;

            seen.set(token, wordEnd);
            break;
        }
    }
});

This works in most cases, as shown here:

/**
 * Split raw text into tokens, stripping most punctuation along the way.
 *
 * The input string is not modified. Leading or trailing whitespace left over
 * after the clean-up never produces empty-string tokens, so an empty or
 * whitespace-only input yields an empty array.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance, without empty strings.
 */
function aggressive_tokenizer(text) {
  const cleaned = text
    // most punctuation: pad every character outside the kept set
    // (word chars and . ' - / + < > , &) with spaces; this includes whitespace
    .replace(/([^\w\.\'\-\/\+\<\>,&])/g, " $1 ")
    // commas if followed by space
    .replace(/(,\s)/g, " $1")
    // single quotes if followed by a space
    .replace(/('\s)/g, " $1")
    // single quotes if last char
    .replace(/('$)/, " $1")
    // whitespace + opening quote character directly before a word: keep only the word
    .replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2")
    // periods before newline or end of string
    .replace(/\. *(\n|$)/g, " . ")
    // replace punct
    // ignore "-" since may be in slang scream
    .replace(/[\\?\^%<>=!&|+\~]/g, "")
    .replace(/[…;,.:*#\)\({}\[\]]/g, "");

  // finally split the remainder into words; splitting on whitespace yields ""
  // at either end when the cleaned text starts or ends with whitespace, so
  // those empty entries are dropped.
  return cleaned.split(/\s+/).filter((token) => token !== "");
}

// Maps each token to the end offset of its most recently located occurrence,
// so that a repeated token resolves to its next occurrence, not the first one.
var seen = new Map();
var text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

var tokens = aggressive_tokenizer(text);
// One record per token; the offsets are present only when the token was located.
var indexes = tokens.map(token => { // for each token
  const item = {
    "word": token
  };
  // Escape regex metacharacters so the token is matched literally, and
  // interpolate it instead of using String#replace so that "$" sequences in
  // the token are not interpreted as replacement patterns.
  const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const wordRegex = new RegExp(`\\b(${escaped})\\b`, "g");

  // Explicit has-check instead of `|| -1`: a stored end offset of 0 is a
  // real position and must not be treated as "not seen yet".
  const lastEnd = seen.has(token) ? seen.get(token) : -1;

  // calculate token begin end
  let match = null;
  while ((match = wordRegex.exec(text)) !== null) {
    // Skip occurrences at or before the one already assigned to this token.
    if (match.index > lastEnd) {
      const wordStart = match.index;
      const wordEnd = wordStart + token.length - 1; // inclusive end offset
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
  return item;
});
console.log(indexes);

However, I have found some cases where the offsets are missing:

 // Same sample sentence, but "Lorem" and "dolor" now carry a leading apostrophe.
 var text = "'Lorem ipsum 'dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

Here I have added a "'" to some words: "'Lorem" and "'dolor" (in English this would be something like a contraction, e.g. "'Cause"). Now it no longer works as expected:

{
    "word": "'Lorem"
}

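This is probably because of the pattern = "\\b($1)\\b"; that I am using to match the word exactly and get the right begin and end character offsets: \b only matches between a word character and a non-word character, so it does not match in front of a leading apostrophe that follows whitespace or the start of the text. The tokenizer, however, tokenizes text like 'Cause as 'Cause, keeping the apostrophe so that the token can be analyzed further (e.g. transforming 'cause into because in an NLP pipeline); hence I cannot remove the "'" from those tokens.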

Another attempt is to use the regex

// Whitespace-delimited alternative to \b: the text substituted for the $1
// placeholder must be neither preceded nor followed by a non-whitespace character.
pattern = "(?<!\\S)$1(?!\\S)";

This works in the case of 'Lorem, but it could fail in other cases.

来源：https://stackoverflow.com/questions/64032621/detect-exact-words-positions-in-text-in-javascript

标签

javascript

tokenize