diff --git a/src/tokenizer.js b/src/tokenizer.js
index 501f9e4..a2a1b66 100644
--- a/src/tokenizer.js
+++ b/src/tokenizer.js
@@ -344,7 +344,7 @@ const TOKENIZE_RE = [
   /\b([Cc])([Oo])[\.][\,]([Ll])([Tt])([Dd])[\.]/g, "_$1$2dc$3$4$5_", // co.,ltd.
   /\b([Cc])([Oo])([Rr]?)([Pp]?)[\.]/g, "_$1$2$3$4_", // Corp. and Co.
   /\b([Ll])([Tt])([Dd])[\.]/g, "_$1$2$3_", // ltd.
-  /\b(prof|Prof|PROF)[\.]/g, "_$1_", //Prof.
+  /\b(prof|Prof|PROF)\./g, "_$1_", //Prof.
   // /(\w+([\.-_]?\w+)*)@(\w+([\.-_]?\w+)*)\.(\w{2,3})/g, "$1__AT__$3.$5", //email addresses
   // /^\w+([\.-]?\w+)+@\w+([\.:]?\w+)+(\.[a-zA-Z0-9]{2,3})+$/g, "$1__AT__$2", //email addresses
   /\b([\w.]+)@(\w+\.\w+)/g, "$1__AT__$2",
diff --git a/test/tokenizer-tests.js b/test/tokenizer-tests.js
index 88bf875..0de1193 100644
--- a/test/tokenizer-tests.js
+++ b/test/tokenizer-tests.js
@@ -188,9 +188,15 @@ describe('Tokenizer', () => {
     expect(RiTa.tokenize("The programs.")).eql(["The", "programs", "."]);
     expect(RiTa.tokenize("The find.")).eql(["The", "find", "."]);
     expect(RiTa.tokenize("The bancor.", { debug: 0 })).eql(["The", "bancor", "."]);
+    expect(RiTa.tokenize("The prof. ate.", { debug: 0 })).eql(["The", "prof.", "ate", "."]);
 
     let input, expected, output;
 
+    input = "According to the prof. climate change was real.";
+    expected = ['According', 'to', 'the', 'prof.', 'climate', 'change', 'was', 'real', '.'];
+    output = RiTa.tokenize(input);
+    expect(output).eql(expected);
+
     input = "The student said 'learning is fun'";
     expected = ["The", "student", "said", "'", "learning", "is", "fun", "'"];
     output = RiTa.tokenize(input);