diff --git a/lib/inlines.js b/lib/inlines.js index 1ecf8eef..c9e77d9c 100644 --- a/lib/inlines.js +++ b/lib/inlines.js @@ -37,6 +37,9 @@ var reHtmlTag = common.reHtmlTag; var rePunctuation = new RegExp( /^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\p{P}\p{S}]/u); +/* Character class (u flag) listing the code point ranges this patch counts as CJK; scanDelims tests the runes around a delimiter run against it. */ var reCjk = /[\u2e80-\u4dbf\u4e00-\ua4cf\uf900-\ufaff\ufe10-\ufe1f\ufe30-\ufe6f\uff00-\uffee\u{1b000}-\u{1b16f}\u{20000}-\u{3ffff}\u{e0100}-\u{e01ef}]/u; +/* Matches the variation selectors U+FE00-U+FE02 and U+FE0E; when char_before is one of these, scanDelims also tests the rune before it against reCjk. */ var reSVSWithCjk = /[\ufe00-\ufe02\ufe0e]/u; + var reLinkTitle = new RegExp( '^(?:"(' + ESCAPED_CHAR + @@ -242,13 +245,31 @@ var parseHtmlTag = function(block) { } }; +/* Returns [rune, index]: the code point that ends just before UTF-16 index pos in str, and the index at which it starts. pos <= 0 yields ["\n", 0]; a high+low surrogate pair is returned as one two-unit string; a low surrogate with no preceding high surrogate is returned on its own. */ var beforeRuneAndPos = function(str, pos) { + if (pos <= 0) { + return ["\n", 0]; + } + var beforeCharCode = str.charCodeAt(pos - 1); + if (beforeCharCode < 0xdc00 || beforeCharCode > 0xdfff || pos === 1) { + // not surrogate pair or starts with lower surrogate + return [str.charAt(pos - 1), pos - 1]; + } + var twoBeforeCharCode = str.charCodeAt(pos - 2); + if (twoBeforeCharCode < 0xd800 || twoBeforeCharCode > 0xdbff) { + // lonely low surrogate + return [str.charAt(pos - 1), pos - 1]; + } + // valid surrogate pair + return [str.substring(pos - 2, pos), pos - 2]; +} + // Scan a sequence of characters with code cc, and return information about // the number of delimiters and whether they are positioned such that // they can open and/or close emphasis or strong emphasis. A utility // function for strong/emph parsing. var scanDelims = function(cc) { var numdelims = 0; - var char_before, char_after, cc_after; + var char_before, char_before_pos, char_two_before, char_after, cc_after; var startpos = this.pos; var left_flanking, right_flanking, can_open, can_close; var after_is_whitespace, @@ -270,8 +291,17 @@ var scanDelims = function(cc) { return null; } - char_before = startpos === 0 ? 
"\n" : this.subject.charAt(startpos - 1); - + [char_before, char_before_pos] = beforeRuneAndPos(this.subject, startpos); + // Seldom used, so use lazy evaluation + /* Holder whose get(pos, subject) computes beforeRuneAndPos(subject, pos)[0] on the first call and returns the cached value afterwards. NOTE(review): only one get() call is visible in these hunks and && already short-circuits, so a direct call may suffice - confirm against the full function. */ char_two_before = { + cached: undefined, + get(startpos, subject) { + if (this.cached === undefined) { + this.cached = beforeRuneAndPos(subject, startpos)[0]; + } + return this.cached; + } + }; cc_after = this.peek(); if (cc_after === -1) { char_after = "\n"; @@ -283,15 +313,24 @@ var scanDelims = function(cc) { after_is_punctuation = rePunctuation.test(char_after); before_is_whitespace = reUnicodeWhitespaceChar.test(char_before); before_is_punctuation = rePunctuation.test(char_before); + /* True if the rune before or after the delimiter run matches reCjk, or if char_before matches reSVSWithCjk and the rune preceding it matches reCjk; it is ORed into both flanking tests below. */ var either_is_cjk = + reCjk.test(char_before) || + reCjk.test(char_after) || + (reSVSWithCjk.test(char_before) && reCjk.test(char_two_before.get(char_before_pos, this.subject))); left_flanking = !after_is_whitespace && (!after_is_punctuation || before_is_whitespace || - before_is_punctuation); + before_is_punctuation || + either_is_cjk); right_flanking = !before_is_whitespace && - (!before_is_punctuation || after_is_whitespace || after_is_punctuation); + (!before_is_punctuation || + after_is_whitespace || + after_is_punctuation || + either_is_cjk + ); if (cc === C_UNDERSCORE) { can_open = left_flanking && (!right_flanking || before_is_punctuation); can_close = right_flanking && (!left_flanking || after_is_punctuation); diff --git a/test/regression.txt b/test/regression.txt index 40630a7b..4e471537 100644 --- a/test/regression.txt +++ b/test/regression.txt @@ -60,7 +60,7 @@ Issue #108 - Chinese punctuation not recognized ```````````````````````````````` example **。**话 . -
<p>**。**话</p>
+<p><strong>。</strong>话</p>
```````````````````````````````` Issue jgm/cmark#177 - incorrect emphasis parsing @@ -518,3 +518,18 @@ foo more -->foo
foo more -->
```````````````````````````````` + +```````````````````````````````` example +𠮷**(U+20BB7)** + +禰󠄀**(ね)**豆子 + +福︀**(福)**祉︀**(祉)** + +**㊙︎**Top Secret**㊙︎** +. +𠮷(U+20BB7)
+禰󠄀(ね)豆子
+福︀(福)祉︀(祉)
+㊙︎Top Secret㊙︎
+````````````````````````````````