From 887617b43c80216a8ca9910ddc076f590e100d0d Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Tue, 10 Sep 2024 06:30:18 +0900 Subject: [PATCH 1/2] Add CJK handling --- lib/inlines.js | 49 ++++++++++++++++++++++++++++++++++++++++----- test/regression.txt | 2 +- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/lib/inlines.js b/lib/inlines.js index 1ecf8eef..c9e77d9c 100644 --- a/lib/inlines.js +++ b/lib/inlines.js @@ -37,6 +37,9 @@ var reHtmlTag = common.reHtmlTag; var rePunctuation = new RegExp( /^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\p{P}\p{S}]/u); +var reCjk = /[\u2e80-\u4dbf\u4e00-\ua4cf\uf900-\ufaff\ufe10-\ufe1f\ufe30-\ufe6f\uff00-\uffee\u{1b000}-\u{1b16f}\u{20000}-\u{3ffff}\u{e0100}-\u{e01ef}]/u; +var reSVSWithCjk = /[\ufe00-\ufe02\ufe0e]/u; + var reLinkTitle = new RegExp( '^(?:"(' + ESCAPED_CHAR + @@ -242,13 +245,31 @@ var parseHtmlTag = function(block) { } }; +var beforeRuneAndPos = function(str, pos) { + if (pos <= 0) { + return ["\n", 0]; + } + var beforeCharCode = str.charCodeAt(pos - 1); + if (beforeCharCode < 0xdc00 || beforeCharCode > 0xdfff || pos === 1) { + // not surrogate pair or starts with lower surrogate + return [str.charAt(pos - 1), pos - 1]; + } + var twoBeforeCharCode = str.charCodeAt(pos - 2); + if (twoBeforeCharCode < 0xd800 || twoBeforeCharCode > 0xdbff) { + // lonely low surrogate + return [str.charAt(pos - 1), pos - 1]; + } + // valid surrogate pair + return [str.substring(pos - 2, pos), pos - 2]; +} + // Scan a sequence of characters with code cc, and return information about // the number of delimiters and whether they are positioned such that // they can open and/or close emphasis or strong emphasis. A utility // function for strong/emph parsing. var scanDelims = function(cc) { var numdelims = 0; - var char_before, char_after, cc_after; + var char_before, char_before_pos, char_two_before, char_after, cc_after; var startpos = this.pos; var left_flanking, right_flanking, can_open, can_close; var after_is_whitespace, @@ -270,8 +291,17 @@ var scanDelims = function(cc) { return null; } - char_before = startpos === 0 ? "\n" : this.subject.charAt(startpos - 1); - + [char_before, char_before_pos] = beforeRuneAndPos(this.subject, startpos); + // Seldom used, so use lazy evaluation + char_two_before = { + cached: undefined, + get(startpos, subject) { + if (this.cached === undefined) { + this.cached = beforeRuneAndPos(subject, startpos)[0]; + } + return this.cached; + } + }; cc_after = this.peek(); if (cc_after === -1) { char_after = "\n"; @@ -283,15 +313,24 @@ var scanDelims = function(cc) { after_is_punctuation = rePunctuation.test(char_after); before_is_whitespace = reUnicodeWhitespaceChar.test(char_before); before_is_punctuation = rePunctuation.test(char_before); + var either_is_cjk = + reCjk.test(char_before) || + reCjk.test(char_after) || + (reSVSWithCjk.test(char_before) && reCjk.test(char_two_before.get(char_before_pos, this.subject))); left_flanking = !after_is_whitespace && (!after_is_punctuation || before_is_whitespace || - before_is_punctuation); + before_is_punctuation || + either_is_cjk); right_flanking = !before_is_whitespace && - (!before_is_punctuation || after_is_whitespace || after_is_punctuation); + (!before_is_punctuation || + after_is_whitespace || + after_is_punctuation || + either_is_cjk + ); if (cc === C_UNDERSCORE) { can_open = left_flanking && (!right_flanking || before_is_punctuation); can_close = right_flanking && (!left_flanking || after_is_punctuation); diff --git a/test/regression.txt b/test/regression.txt index 40630a7b..ef62b6c7 100644 --- a/test/regression.txt +++ b/test/regression.txt @@ -60,7 +60,7 @@ Issue #108 - Chinese punctuation not recognized ```````````````````````````````` example **。**话 . -

**。**话

+

```````````````````````````````` Issue jgm/cmark#177 - incorrect emphasis parsing From 726e9fb38f325bbcb9bcc6f18d7e8e4a4b51cb1f Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Tue, 10 Sep 2024 06:30:46 +0900 Subject: [PATCH 2/2] Some extra tests --- test/regression.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/regression.txt b/test/regression.txt index ef62b6c7..4e471537 100644 --- a/test/regression.txt +++ b/test/regression.txt @@ -518,3 +518,18 @@ foo more -->

foo

foo more -->

```````````````````````````````` + +```````````````````````````````` example +𠮷**(U+20BB7)** + +禰󠄀**(ね)**豆子 + +福︀**(福)**祉︀**(祉)** + +**㊙︎**Top Secret**㊙︎** +. +

𠮷(U+20BB7)

+

禰󠄀(ね)豆子

+

福︀(福)祉︀(祉)

+

㊙︎Top Secret㊙︎

+````````````````````````````````