commonmark · tats-u · Sep 9, 2024 · Sep 9, 2024
diff --git a/lib/inlines.js b/lib/inlines.js
@@ -37,6 +37,9 @@ var reHtmlTag = common.reHtmlTag;
 var rePunctuation = new RegExp(
     /^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\p{P}\p{S}]/u);
 
+var reCjk = /[\u2e80-\u4dbf\u4e00-\ua4cf\uf900-\ufaff\ufe10-\ufe1f\ufe30-\ufe6f\uff00-\uffee\u{1b000}-\u{1b16f}\u{20000}-\u{3ffff}\u{e0100}-\u{e01ef}]/u;
+var reSVSWithCjk = /[\ufe00-\ufe02\ufe0e]/u;
+
 var reLinkTitle = new RegExp(
     '^(?:"(' +
         ESCAPED_CHAR +
@@ -242,13 +245,31 @@ var parseHtmlTag = function(block) {
     }
 };
 
+var beforeRuneAndPos = function(str, pos) { // returns [character ending just before pos (one code unit or a full surrogate pair), index where it starts]
+    if (pos <= 0) {
+        return ["\n", 0]; // nothing precedes the start of input: report a newline at index 0
+    }
+    var beforeCharCode = str.charCodeAt(pos - 1);
+    if (beforeCharCode < 0xdc00 || beforeCharCode > 0xdfff || pos === 1) {
+        // not a low surrogate, or a low surrogate with no code unit before it
+        return [str.charAt(pos - 1), pos - 1];
+    }
+    var twoBeforeCharCode = str.charCodeAt(pos - 2);
+    if (twoBeforeCharCode < 0xd800 || twoBeforeCharCode > 0xdbff) {
+        // lone low surrogate: the preceding code unit is not a high surrogate
+        return [str.charAt(pos - 1), pos - 1];
+    }
+    // valid surrogate pair: return both UTF-16 code units as one string
+    return [str.substring(pos - 2, pos), pos - 2];
+}
+
 // Scan a sequence of characters with code cc, and return information about
 // the number of delimiters and whether they are positioned such that
 // they can open and/or close emphasis or strong emphasis.  A utility
 // function for strong/emph parsing.
 var scanDelims = function(cc) {
     var numdelims = 0;
-    var char_before, char_after, cc_after;
+    var char_before, char_before_pos, char_two_before, char_after, cc_after;
     var startpos = this.pos;
     var left_flanking, right_flanking, can_open, can_close;
     var after_is_whitespace,
@@ -270,8 +291,17 @@ var scanDelims = function(cc) {
         return null;
     }
 
-    char_before = startpos === 0 ? "\n" : this.subject.charAt(startpos - 1);
-
+    [char_before, char_before_pos] = beforeRuneAndPos(this.subject, startpos);
+    // Only needed when char_before is a variation selector, so evaluate it lazily
+    char_two_before = {
+        cached: undefined,
+        get(startpos, subject) {
+            if (this.cached === undefined) {
+                this.cached = beforeRuneAndPos(subject, startpos)[0];
+            }
+            return this.cached;
+        }
+    };
     cc_after = this.peek();
     if (cc_after === -1) {
         char_after = "\n";
@@ -283,15 +313,24 @@ var scanDelims = function(cc) {
     after_is_punctuation = rePunctuation.test(char_after);
     before_is_whitespace = reUnicodeWhitespaceChar.test(char_before);
     before_is_punctuation = rePunctuation.test(char_before);
+    var either_is_cjk =
+        reCjk.test(char_before) ||
+        reCjk.test(char_after) ||
+        (reSVSWithCjk.test(char_before) && reCjk.test(char_two_before.get(char_before_pos, this.subject)));
 
     left_flanking =
         !after_is_whitespace &&
         (!after_is_punctuation ||
             before_is_whitespace ||
-            before_is_punctuation);
+            before_is_punctuation ||
+            either_is_cjk);
     right_flanking =
         !before_is_whitespace &&
-        (!before_is_punctuation || after_is_whitespace || after_is_punctuation);
+        (!before_is_punctuation ||
+            after_is_whitespace ||
+            after_is_punctuation ||
+            either_is_cjk
+        );
     if (cc === C_UNDERSCORE) {
         can_open = left_flanking && (!right_flanking || before_is_punctuation);
         can_close = right_flanking && (!left_flanking || after_is_punctuation);

diff --git a/test/regression.txt b/test/regression.txt
@@ -60,7 +60,7 @@ Issue #108 - Chinese punctuation not recognized
 ```````````````````````````````` example
 **。**话
 .
-<p>**。**话</p>
+<p><strong>。</strong>话</p>
 ````````````````````````````````
 
 Issue jgm/cmark#177 - incorrect emphasis parsing
@@ -518,3 +518,18 @@ foo <!-- test --> more -->
 <p>foo <!-----></p>
 <p>foo <!-- test --> more --&gt;</p>
 ````````````````````````````````
+
+```````````````````````````````` example
+𠮷**(U+20BB7)**
+
+禰󠄀**(ね)**豆子
+
+福︀**(福)**祉︀**(祉)**
+
+**㊙︎**Top Secret**㊙︎**
+.
+<p>𠮷<strong>(U+20BB7)</strong></p>
+<p>禰󠄀<strong>(ね)</strong>豆子</p>
+<p>福︀<strong>(福)</strong>祉︀<strong>(祉)</strong></p>
+<p><strong>㊙︎</strong>Top Secret<strong>㊙︎</strong></p>
+````````````````````````````````