From cfc14af7589679c18406c2d3ab0bcc0e8dbe6a35 Mon Sep 17 00:00:00 2001
From: kmarius <5224719+kmarius@users.noreply.github.com>
Date: Mon, 22 Jan 2024 14:46:20 +0100
Subject: [PATCH 1/8] update description

---
 jsregexp-0.0.7-1.rockspec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jsregexp-0.0.7-1.rockspec b/jsregexp-0.0.7-1.rockspec
index b71a9d9..62b97ad 100644
--- a/jsregexp-0.0.7-1.rockspec
+++ b/jsregexp-0.0.7-1.rockspec
@@ -8,7 +8,7 @@ source = {
 description = {
 	summary = "javascript (ECMA19) regular expressions for lua",
 	detailed = [[
-WIP: This library offers a single function to use javascript regular expressions in lua. It makes use of libregexp from https://bellard.org/quickjs/.
+Provides ECMAScript regular expressions for Lua 5.1, 5.2, 5.3, 5.4 and LuaJIT. Uses libregexp from Fabrice Bellard's QuickJS.
 	]],
 	homepage = "https://github.com/kmarius/jsregexp",
 	license = "MIT",

From 331d4be26162d651be406b193b8465779e84e5bd Mon Sep 17 00:00:00 2001
From: kmarius <5224719+kmarius@users.noreply.github.com>
Date: Tue, 25 Jun 2024 15:08:17 +0200
Subject: [PATCH 2/8] add .clang-format

---
 .clang-format | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)
 create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..be9d1eb
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,224 @@
+---
+Language:        Cpp
+# BasedOnStyle:  LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignConsecutiveAssignments:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCompound:   false
+  PadOperators:    true
+AlignConsecutiveBitFields:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCompound:   false
+  PadOperators:    false
+AlignConsecutiveDeclarations:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCompound:   false
+  PadOperators:    false
+AlignConsecutiveMacros:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCompound:   false
+  PadOperators:    false
+AlignEscapedNewlines: Right
+AlignOperands:   Align
+AlignTrailingComments:
+  Kind:            Always
+  OverEmptyLines:  0
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortEnumsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: false
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+AttributeMacros:
+  - __capability
+BinPackArguments: true
+BinPackParameters: true
+BitFieldColonSpacing: Both
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: Never
+  AfterEnum:       false
+  AfterExternBlock: false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile:     false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakAfterAttributes: Never
+BreakAfterJavaFieldAnnotations: false
+BreakArrays:     true
+BreakBeforeBinaryOperators: None
+BreakBeforeConceptDeclarations: Always
+BreakBeforeBraces: Attach
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: LogicalBlock
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IfMacros:
+  - KJ_IF_MAYBE
+IncludeBlocks:   Preserve
+IncludeCategories:
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
+    Priority:        3
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '.*'
+    Priority:        1
+    SortPriority:    0
+    CaseSensitive:   false
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseBlocks: false
+IndentCaseLabels: false
+IndentExternBlock: AfterExternBlock
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentRequiresClause: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+InsertBraces:    false
+InsertNewlineAtEOF: false
+InsertTrailingCommas: None
+IntegerLiteralSeparator:
+  Binary:          0
+  BinaryMinDigits: 0
+  Decimal:         0
+  DecimalMinDigits: 0
+  Hex:             0
+  HexMinDigits:    0
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+LambdaBodyIndentation: Signature
+LineEnding:      DeriveLF
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCBreakBeforeNestedBlockParam: true
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PackConstructorInitializers: BinPack
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakOpenParenthesis: 0
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyIndentedWhitespace: 0
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+PPIndentWidth:   -1
+QualifierAlignment: Leave
+ReferenceAlignment: Pointer
+ReflowComments:  true
+RemoveBracesLLVM: false
+RemoveSemicolon: false
+RequiresClausePosition: OwnLine
+RequiresExpressionIndentation: OuterScope
+SeparateDefinitionBlocks: Leave
+ShortNamespaceLines: 1
+SortIncludes:    CaseSensitive
+SortJavaStaticImport: Before
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeParensOptions:
+  AfterControlStatements: true
+  AfterForeachMacros: true
+  AfterFunctionDefinitionName: false
+  AfterFunctionDeclarationName: false
+  AfterIfMacros:   true
+  AfterOverloadedOperator: false
+  AfterRequiresInClause: false
+  AfterRequiresInExpression: false
+  BeforeNonEmptyParentheses: false
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  Never
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInLineCommentPrefix:
+  Minimum:         1
+  Maximum:         -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Latest
+StatementAttributeLikeMacros:
+  - Q_EMIT
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseTab:          Never
+WhitespaceSensitiveMacros:
+  - BOOST_PP_STRINGIZE
+  - CF_SWIFT_NAME
+  - NS_SWIFT_NAME
+  - PP_STRINGIZE
+  - STRINGIZE
+...

From 4b7e242dbdee135af6d7b4d90d60ca2ed71a1352 Mon Sep 17 00:00:00 2001
From: kmarius <5224719+kmarius@users.noreply.github.com>
Date: Tue, 25 Jun 2024 15:09:47 +0200
Subject: [PATCH 3/8] formatting

---
 jsregexp.c   | 276 +++++++++++++++++++++++------------------------
 jsregexp.lua | 294 ++++++++++++++++++++++++++++-----------------------
 test.lua     | 210 +++++++++++++++++++++++++-----------
 3 files changed, 444 insertions(+), 336 deletions(-)

diff --git a/jsregexp.c b/jsregexp.c
index b5f8eff..305256d 100644
--- a/jsregexp.c
+++ b/jsregexp.c
@@ -10,12 +10,11 @@
 #include "libregexp/cutils.h"
 #include "libregexp/libregexp.h"
 
-#define CAPTURE_COUNT_MAX 255  /* from libregexp.c */
+#define CAPTURE_COUNT_MAX 255 /* from libregexp.c */
 #define JSREGEXP_MT "jsregexp_meta"
 #define JSREGEXP_MATCH_MT "jsregexp_match_meta"
 #define JSSTRING_MT "jsstring_meta"
 
-
 #if LUA_VERSION_NUM >= 502
 #define new_lib(L, l) (luaL_newlib(L, l))
 #define lua_tbl_len(L, arg) (lua_rawlen(L, arg))
@@ -33,7 +32,7 @@
 #define streq(X, Y) ((*(X) == *(Y)) && strcmp(X, Y) == 0)
 
 struct regexp {
-  char* expr;
+  char *expr;
   uint8_t *bc;
   uint32_t last_index;
 };
@@ -41,21 +40,19 @@ struct regexp {
 struct jsstring {
   bool is_wide_char;
   uint32_t len;
-  char* bstr; // base string passed in
+  char *bstr;        // base string passed in
   uint32_t bstr_len; // base string length
-  uint32_t* indices;
-  uint32_t* rev_indices;
+  uint32_t *indices;
+  uint32_t *rev_indices;
   union {
-      uint8_t* str8; /* 8 bit strings will get an extra null terminator */
-      uint16_t* str16;
+    uint8_t *str8; /* 8 bit strings will get an extra null terminator */
+    uint16_t *str16;
   } u;
 };
 
-
 // check for bytes higher or equal to 0xf0
-static inline bool utf8_contains_non_bmp(const char *s)
-{
-  uint8_t *q = (uint8_t *) s;
+static inline bool utf8_contains_non_bmp(const char *s) {
+  uint8_t *q = (uint8_t *)s;
   while (*q) {
     if ((*q++ & 0xf0) == 0xf0) {
       return true;
@@ -64,9 +61,7 @@ static inline bool utf8_contains_non_bmp(const char *s)
   return false;
 }
 
-
-static inline bool utf8_contains_non_ascii(const char *s)
-{
+static inline bool utf8_contains_non_ascii(const char *s) {
   while (*s) {
     if (*s++ & 0x80) {
       return true;
@@ -75,27 +70,22 @@ static inline bool utf8_contains_non_ascii(const char *s)
   return false;
 }
 
-
 // returns NULL when malformed unicode is encountered, otherwise returns the
 // converted string. *utf16_len will contain the length of the string and
 // *indices an (allocated) array mapping each utf16 code point to the utf8 code
 // point in the input string.
-static inline uint16_t *utf8_to_utf16(
-    const uint8_t *input,
-    uint32_t n,
-    uint32_t *utf16_len,
-    uint32_t **indices,
-    uint32_t **rev_indices)
-{
-  *indices = calloc((n+1), sizeof **indices);
+static inline uint16_t *utf8_to_utf16(const uint8_t *input, uint32_t n,
+                                      uint32_t *utf16_len, uint32_t **indices,
+                                      uint32_t **rev_indices) {
+  *indices = calloc((n + 1), sizeof **indices);
   // TODO: lazy way of doing it, later implement using binary search tree
-  *rev_indices = calloc((n+1), sizeof **indices);
-  uint16_t *str = malloc((n+1) * sizeof *str);
+  *rev_indices = calloc((n + 1), sizeof **indices);
+  uint16_t *str = malloc((n + 1) * sizeof *str);
   uint16_t *q = str;
   const uint8_t *pos = input;
   while (*pos) {
-    (*indices)[q-str] = pos - input;
-    (*rev_indices)[pos - input] = q-str;
+    (*indices)[q - str] = pos - input;
+    (*rev_indices)[pos - input] = q - str;
     int c = unicode_from_utf8(pos, UTF8_CHAR_LEN_MAX, &pos);
     if (c == -1) {
       // malformed
@@ -104,7 +94,7 @@ static inline uint16_t *utf8_to_utf16(
       free(*rev_indices);
       return NULL;
     }
-    if ((unsigned) c > 0xffff) {
+    if ((unsigned)c > 0xffff) {
       *q++ = (((c - 0x10000) >> 10) | (0xd8 << 8));
       *q++ = (c & 0xfffff) | (0xdc << 8);
     } else {
@@ -119,22 +109,22 @@ static inline uint16_t *utf8_to_utf16(
   return str;
 }
 
-
-static int jsstring_new(lua_State* lstate) {
-  if(lua_isuserdata(lstate, 1)) {
+static int jsstring_new(lua_State *lstate) {
+  if (lua_isuserdata(lstate, 1)) {
     luaL_checkudata(lstate, 1, JSSTRING_MT);
     lua_pushvalue(lstate, 1);
     return 1;
   }
 
   size_t input_len;
-  const uint8_t* input = (uint8_t*)luaL_checklstring(lstate, 1, &input_len);
-  struct jsstring* ud;
-  if (utf8_contains_non_ascii((char *) input)) {
+  const uint8_t *input = (uint8_t *)luaL_checklstring(lstate, 1, &input_len);
+  struct jsstring *ud;
+  if (utf8_contains_non_ascii((char *)input)) {
     uint32_t *indices;
     uint32_t *rev_indices;
     uint32_t input_utf16_len;
-    uint16_t *input_utf16 = utf8_to_utf16(input, input_len, &input_utf16_len, &indices, &rev_indices);
+    uint16_t *input_utf16 = utf8_to_utf16(input, input_len, &input_utf16_len,
+                                          &indices, &rev_indices);
 
     if (!input_utf16) {
       luaL_error(lstate, "malformed unicode");
@@ -144,7 +134,7 @@ static int jsstring_new(lua_State* lstate) {
     ud->is_wide_char = true;
     ud->len = input_utf16_len;
     ud->u.str16 = input_utf16;
-    ud->bstr = strdup((char*)input);
+    ud->bstr = strdup((char *)input);
     ud->bstr_len = input_len;
     ud->indices = indices;
     ud->rev_indices = rev_indices;
@@ -153,8 +143,8 @@ static int jsstring_new(lua_State* lstate) {
     ud->is_wide_char = false;
     ud->len = input_len;
     ud->bstr_len = input_len;
-    ud->u.str8 =(uint8_t*) strdup((char*)input);
-    ud->bstr = (char*)ud->u.str8;
+    ud->u.str8 = (uint8_t *)strdup((char *)input);
+    ud->bstr = (char *)ud->u.str8;
     ud->indices = NULL;
     ud->rev_indices = NULL;
   }
@@ -163,7 +153,7 @@ static int jsstring_new(lua_State* lstate) {
   return 1;
 }
 
-static int jsstring_gc(lua_State* lstate) {
+static int jsstring_gc(lua_State *lstate) {
   struct jsstring *s = lua_touserdata(lstate, 1);
   free(s->u.str8);
   free(s->indices);
@@ -175,26 +165,22 @@ static int jsstring_gc(lua_State* lstate) {
   return 0;
 }
 
-static struct luaL_Reg jsstring_meta[] = {
-  {"__gc", jsstring_gc},
-  {NULL, NULL}
-};
+static struct luaL_Reg jsstring_meta[] = {{"__gc", jsstring_gc}, {NULL, NULL}};
 
-static inline struct jsstring* lua_tojsstring(lua_State *lstate, int arg) {
+static inline struct jsstring *lua_tojsstring(lua_State *lstate, int arg) {
   if (lua_isuserdata(lstate, arg)) {
     // already jsstring
-    return (struct jsstring*) luaL_checkudata(lstate, arg, JSSTRING_MT);
+    return (struct jsstring *)luaL_checkudata(lstate, arg, JSSTRING_MT);
   } else {
     // coerce to jsstring
     lua_pushcfunction(lstate, jsstring_new);
     lua_insert(lstate, arg);
     lua_call(lstate, 1, 1);
-    return (struct jsstring*) luaL_checkudata(lstate, arg, JSSTRING_MT);
+    return (struct jsstring *)luaL_checkudata(lstate, arg, JSSTRING_MT);
   }
 }
 
-static int regexp_call(lua_State *lstate)
-{
+static int regexp_call(lua_State *lstate) {
   uint8_t *capture[CAPTURE_COUNT_MAX * 2];
 
   struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT);
@@ -202,17 +188,19 @@ static int regexp_call(lua_State *lstate)
   const int named_groups = lre_get_flags(r->bc) & LRE_FLAG_NAMED_GROUPS;
   const int capture_count = lre_get_capture_count(r->bc);
 
-  struct jsstring* input = lua_tojsstring(lstate, 2);
+  struct jsstring *input = lua_tojsstring(lstate, 2);
 
   int nmatch = 0;
   int cindex = 0;
 
   if (input->is_wide_char) {
     lua_newtable(lstate);
-    while (lre_exec(capture, r->bc, input->u.str8, cindex, input->len, 1, NULL) == 1) {
+    while (lre_exec(capture, r->bc, input->u.str8, cindex, input->len, 1,
+                    NULL) == 1) {
       if (capture[0] == capture[1]) {
-        // empty match -> continue matching from next character (to prevent an endless loop).
-        // This is basically the same implementation as in quickjs, see
+        // empty match -> continue matching from next character (to prevent an
+        // endless loop). This is basically the same implementation as in
+        // quickjs, see
         // https://github.com/bellard/quickjs/blob/2788d71e823b522b178db3b3660ce93689534e6d/quickjs.c#L42857-L42869
 
         cindex++;
@@ -221,12 +209,13 @@ static int regexp_call(lua_State *lstate)
           cindex++;
         }
       } else {
-        cindex = (capture[1] - (uint8_t *) input->u.str16) / 2;
+        cindex = (capture[1] - (uint8_t *)input->u.str16) / 2;
       }
 
       lua_newtable(lstate);
 
-      lua_pushnumber(lstate, 1 + input->indices[(capture[0] - input->u.str8) / 2]);
+      lua_pushnumber(lstate,
+                     1 + input->indices[(capture[0] - input->u.str8) / 2]);
       lua_setfield(lstate, -2, "begin_ind");
 
       lua_pushnumber(lstate, input->indices[(capture[1] - input->u.str8) / 2]);
@@ -234,21 +223,23 @@ static int regexp_call(lua_State *lstate)
 
       lua_newtable(lstate);
 
-      const char* group_names = NULL;
+      const char *group_names = NULL;
       if (named_groups) {
         lua_newtable(lstate);
         group_names = lre_get_groupnames(r->bc);
       }
       for (int i = 1; i < capture_count; i++) {
-        const uint32_t a = input->indices[(capture[2*i] - input->u.str8) / 2];
-        const uint32_t b = input->indices[(capture[2*i+1] - input->u.str8) / 2];
-        lua_pushlstring(lstate, input->bstr+a, b-a);
+        const uint32_t a = input->indices[(capture[2 * i] - input->u.str8) / 2];
+        const uint32_t b =
+            input->indices[(capture[2 * i + 1] - input->u.str8) / 2];
+        lua_pushlstring(lstate, input->bstr + a, b - a);
         lua_rawseti(lstate, -2, i);
         if (named_groups && group_names != NULL) {
           if (*group_names != '\0') { // check if current group is named
-            lua_pushlstring(lstate, input->bstr+a, b-a);
+            lua_pushlstring(lstate, input->bstr + a, b - a);
             lua_setfield(lstate, -3, group_names);
-            group_names += strlen(group_names) + 1;  // move to the next group name
+            group_names +=
+                strlen(group_names) + 1; // move to the next group name
           } else {
             group_names += 1; // move to the next group name
           }
@@ -270,7 +261,8 @@ static int regexp_call(lua_State *lstate)
     }
   } else {
     lua_newtable(lstate);
-    while (lre_exec(capture, r->bc, input->u.str8, cindex, input->len, 0, NULL) == 1) {
+    while (lre_exec(capture, r->bc, input->u.str8, cindex, input->len, 0,
+                    NULL) == 1) {
       if (capture[0] == capture[1]) {
         cindex++;
       } else {
@@ -287,19 +279,22 @@ static int regexp_call(lua_State *lstate)
 
       lua_newtable(lstate);
 
-      const char* group_names = NULL;
+      const char *group_names = NULL;
       if (named_groups) {
         lua_newtable(lstate);
         group_names = lre_get_groupnames(r->bc);
       }
       for (int i = 1; i < capture_count; i++) {
-        lua_pushlstring(lstate, (char *) capture[2 * i], capture[2 * i + 1] - capture[2 * i]);
+        lua_pushlstring(lstate, (char *)capture[2 * i],
+                        capture[2 * i + 1] - capture[2 * i]);
         lua_rawseti(lstate, -2, i);
         if (named_groups && group_names != NULL) {
           if (*group_names != '\0') { // check if current group is named
-            lua_pushlstring(lstate, (char *) capture[2 * i], capture[2 * i + 1] - capture[2 * i]);
+            lua_pushlstring(lstate, (char *)capture[2 * i],
+                            capture[2 * i + 1] - capture[2 * i]);
             lua_setfield(lstate, -3, group_names);
-            group_names += strlen(group_names) + 1;  // move to the next group name
+            group_names +=
+                strlen(group_names) + 1; // move to the next group name
           } else {
             group_names += 1; // move to the next group name
           }
@@ -323,29 +318,27 @@ static int regexp_call(lua_State *lstate)
   return 1;
 }
 
-
-static int regexp_gc(lua_State *lstate)
-{
+static int regexp_gc(lua_State *lstate) {
   struct regexp *r = lua_touserdata(lstate, 1);
   free(r->bc);
   free(r->expr);
   return 0;
 }
 
-static void regexp_pushflags(lua_State* lstate, const struct regexp *r) {
+static void regexp_pushflags(lua_State *lstate, const struct regexp *r) {
   const int flags = lre_get_flags(r->bc);
-  const char* ignorecase = (flags & LRE_FLAG_IGNORECASE) ? "i" : "";
-  const char* global = (flags & LRE_FLAG_GLOBAL) ? "g" : "";
-  const char* multiline = (flags & LRE_FLAG_MULTILINE) ? "m" : "";
-  const char* named_groups = (flags & LRE_FLAG_NAMED_GROUPS) ? "n" : "";
-  const char* dotall = (flags & LRE_FLAG_DOTALL) ? "s" : "";
-  const char* utf16 = (flags & LRE_FLAG_UTF16) ? "u" : "";
-  const char* sticky = (flags & LRE_FLAG_STICKY) ? "y" : "";
-  lua_pushfstring(lstate, "%s%s%s%s%s%s%s", ignorecase, global, multiline, named_groups, dotall, utf16, sticky);
+  const char *ignorecase = (flags & LRE_FLAG_IGNORECASE) ? "i" : "";
+  const char *global = (flags & LRE_FLAG_GLOBAL) ? "g" : "";
+  const char *multiline = (flags & LRE_FLAG_MULTILINE) ? "m" : "";
+  const char *named_groups = (flags & LRE_FLAG_NAMED_GROUPS) ? "n" : "";
+  const char *dotall = (flags & LRE_FLAG_DOTALL) ? "s" : "";
+  const char *utf16 = (flags & LRE_FLAG_UTF16) ? "u" : "";
+  const char *sticky = (flags & LRE_FLAG_STICKY) ? "y" : "";
+  lua_pushfstring(lstate, "%s%s%s%s%s%s%s", ignorecase, global, multiline,
+                  named_groups, dotall, utf16, sticky);
 }
 
-static int regexp_tostring(lua_State *lstate)
-{
+static int regexp_tostring(lua_State *lstate) {
   const struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT);
   lua_pushfstring(lstate, "/%s/", r->expr);
   regexp_pushflags(lstate, r);
@@ -353,28 +346,24 @@ static int regexp_tostring(lua_State *lstate)
   return 1;
 }
 
-
 // automatic conversion to the global match string
-static int match_tostring(lua_State *lstate)
-{
-  //luaL_getmetatable(lstate, JSREGEXP_MATCH);
-  //if (!lua_getmetatable(lstate, 1) || !lua_equal(lstate, -1, -2)) {
-  //  luaL_argerror(lstate, 1, "match object expected");
-  //}
+static int match_tostring(lua_State *lstate) {
+  // luaL_getmetatable(lstate, JSREGEXP_MATCH);
+  // if (!lua_getmetatable(lstate, 1) || !lua_equal(lstate, -1, -2)) {
+  //   luaL_argerror(lstate, 1, "match object expected");
+  // }
   lua_rawgeti(lstate, 1, 0);
   return 1;
 }
 
-
 // repeatedly running regexp:match(input) is not a good idea because we would
 // convert the string (at least from last_ind) to utf16 every time (if it is
 // needed)
-static int regexp_exec(lua_State *lstate)
-{
+static int regexp_exec(lua_State *lstate) {
   uint8_t *capture[CAPTURE_COUNT_MAX * 2];
 
   struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT);
-  const struct jsstring* input = lua_tojsstring(lstate, 2);
+  const struct jsstring *input = lua_tojsstring(lstate, 2);
 
   const int global = lre_get_flags(r->bc) & LRE_FLAG_GLOBAL;
   const int sticky = lre_get_flags(r->bc) & LRE_FLAG_STICKY;
@@ -396,10 +385,11 @@ static int regexp_exec(lua_State *lstate)
   }
 
   const int capture_count = lre_get_capture_count(r->bc);
-  const char* group_names = lre_get_groupnames(r->bc);
+  const char *group_names = lre_get_groupnames(r->bc);
 
-  const int ret = lre_exec(capture, r->bc, (uint8_t *) input->u.str8, rlast_index,
-      input->len, input->is_wide_char ? 1 : 0, NULL);
+  const int ret =
+      lre_exec(capture, r->bc, (uint8_t *)input->u.str8, rlast_index,
+               input->len, input->is_wide_char ? 1 : 0, NULL);
 
   if (ret < 0) {
     luaL_error(lstate, "out of memory in regexp execution");
@@ -411,7 +401,7 @@ static int regexp_exec(lua_State *lstate)
       r->last_index = 0;
     }
     return 0;
-  } else  if (global || sticky) {
+  } else if (global || sticky) {
     // match found
     if (input->is_wide_char) {
       r->last_index = input->indices[(capture[1] - input->u.str8) / 2];
@@ -431,16 +421,17 @@ static int regexp_exec(lua_State *lstate)
   lua_pushinteger(lstate, capture_count);
   lua_setfield(lstate, -2, "capture_count");
 
-
   if (input->is_wide_char) {
-    lua_pushnumber(lstate, 1 + input->indices[(capture[0] - input->u.str8) / 2]); // 1-based
+    lua_pushnumber(
+        lstate,
+        1 + input->indices[(capture[0] - input->u.str8) / 2]); // 1-based
   } else {
     lua_pushnumber(lstate, 1 + capture[0] - input->u.str8); // 1-based
   }
   lua_setfield(lstate, -2, "index");
 
   if (group_names) {
-    lua_newtable(lstate);               // match.groups
+    lua_newtable(lstate); // match.groups
     lua_pushvalue(lstate, -1);
     lua_setfield(lstate, -3, "groups"); // immediately insert into match
     lua_insert(lstate, -2);             // leave table below the match table
@@ -448,11 +439,13 @@ static int regexp_exec(lua_State *lstate)
 
   for (int i = 0; i < capture_count; i++) {
     if (input->is_wide_char) {
-      const uint32_t a = input->indices[(capture[2*i] - input->u.str8) / 2];
-      const uint32_t b = input->indices[(capture[2*i+1] - input->u.str8) / 2];
-      lua_pushlstring(lstate, input->bstr+a, b-a);
+      const uint32_t a = input->indices[(capture[2 * i] - input->u.str8) / 2];
+      const uint32_t b =
+          input->indices[(capture[2 * i + 1] - input->u.str8) / 2];
+      lua_pushlstring(lstate, input->bstr + a, b - a);
     } else {
-      lua_pushlstring(lstate, (char *) capture[2*i], capture[2*i+1] - capture[2*i]);
+      lua_pushlstring(lstate, (char *)capture[2 * i],
+                      capture[2 * i + 1] - capture[2 * i]);
     }
     if (i > 0 && group_names) {
       // if the current group is named, duplicate and insert into the correct
@@ -471,9 +464,7 @@ static int regexp_exec(lua_State *lstate)
   return 1;
 }
 
-
-static int regexp_test(lua_State *lstate)
-{
+static int regexp_test(lua_State *lstate) {
   if (lua_gettop(lstate) != 2) {
     return luaL_error(lstate, "expecting exactly 2 arguments");
   }
@@ -484,10 +475,8 @@ static int regexp_test(lua_State *lstate)
   return 1;
 }
 
-
 // more gettable fields to be added here
-static int regexp_index(lua_State *lstate)
-{
+static int regexp_index(lua_State *lstate) {
   struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT);
 
   luaL_getmetatable(lstate, JSREGEXP_MT);
@@ -521,10 +510,8 @@ static int regexp_index(lua_State *lstate)
   return 1;
 }
 
-
 // only last_index should be settable
-static int regexp_newindex(lua_State *lstate)
-{
+static int regexp_newindex(lua_State *lstate) {
   struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT);
 
   const char *key = lua_tostring(lstate, 2);
@@ -539,20 +526,16 @@ static int regexp_newindex(lua_State *lstate)
   return 0;
 }
 
-static struct luaL_Reg jsregexp_meta[] = {
-  {"exec", regexp_exec},
-  {"test", regexp_test},
-  {"__gc", regexp_gc},
-  {"__call", regexp_call},
-  {"__tostring", regexp_tostring},
-  {"__index", regexp_index},
-  {"__newindex", regexp_newindex},
-  {NULL, NULL}
-};
-
+static struct luaL_Reg jsregexp_meta[] = {{"exec", regexp_exec},
+                                          {"test", regexp_test},
+                                          {"__gc", regexp_gc},
+                                          {"__call", regexp_call},
+                                          {"__tostring", regexp_tostring},
+                                          {"__index", regexp_index},
+                                          {"__newindex", regexp_newindex},
+                                          {NULL, NULL}};
 
-static int jsregexp_compile(lua_State *lstate)
-{
+static int jsregexp_compile(lua_State *lstate) {
   char error_msg[64];
   int len, re_flags = 0;
 
@@ -573,20 +556,34 @@ static int jsregexp_compile(lua_State *lstate)
     const char *flags = luaL_checkstring(lstate, 2);
     while (*flags) {
       switch (*(flags++)) {
-        case 'i': re_flags |= LRE_FLAG_IGNORECASE; break;
-        case 'g': re_flags |= LRE_FLAG_GLOBAL; break;
-        case 'm': re_flags |= LRE_FLAG_MULTILINE; break;
-        case 'n': re_flags |= LRE_FLAG_NAMED_GROUPS; break;
-        case 's': re_flags |= LRE_FLAG_DOTALL; break;
-        case 'u': re_flags |= LRE_FLAG_UTF16; break;
-        case 'y': re_flags |= LRE_FLAG_STICKY; break;
-        default: /* unknown flag */;
+      case 'i':
+        re_flags |= LRE_FLAG_IGNORECASE;
+        break;
+      case 'g':
+        re_flags |= LRE_FLAG_GLOBAL;
+        break;
+      case 'm':
+        re_flags |= LRE_FLAG_MULTILINE;
+        break;
+      case 'n':
+        re_flags |= LRE_FLAG_NAMED_GROUPS;
+        break;
+      case 's':
+        re_flags |= LRE_FLAG_DOTALL;
+        break;
+      case 'u':
+        re_flags |= LRE_FLAG_UTF16;
+        break;
+      case 'y':
+        re_flags |= LRE_FLAG_STICKY;
+        break;
+      default: /* unknown flag */;
       }
     }
   }
 
   uint8_t *bc = lre_compile(&len, error_msg, sizeof error_msg, regexp,
-      strlen(regexp), re_flags, NULL);
+                            strlen(regexp), re_flags, NULL);
 
   if (!bc) {
     luaL_argerror(lstate, 1, error_msg);
@@ -619,16 +616,13 @@ static int jsregexp_compile_safe(lua_State *lstate) {
   }
 }
 
-
 static const struct luaL_Reg jsregexp_lib[] = {
-  {"compile", jsregexp_compile},
-  {"compile_safe", jsregexp_compile_safe},
-  {"to_jsstring", jsstring_new},
-  {NULL, NULL}
-};
+    {"compile", jsregexp_compile},
+    {"compile_safe", jsregexp_compile_safe},
+    {"to_jsstring", jsstring_new},
+    {NULL, NULL}};
 
-int luaopen_jsregexp_core(lua_State *lstate)
-{
+int luaopen_jsregexp_core(lua_State *lstate) {
   luaL_newmetatable(lstate, JSREGEXP_MATCH_MT);
   lua_pushcfunction(lstate, match_tostring);
   lua_setfield(lstate, -2, "__tostring");
diff --git a/jsregexp.lua b/jsregexp.lua
index bfe4275..6af94b2 100644
--- a/jsregexp.lua
+++ b/jsregexp.lua
@@ -1,70 +1,92 @@
-local jsregexp = require "jsregexp.core"
+local jsregexp = require("jsregexp.core")
 
 setmetatable(jsregexp, {
-    __call = function(self, expr, flags) return jsregexp.compile(expr, flags) end
+	__call = function(self, expr, flags)
+		return jsregexp.compile(expr, flags)
+	end,
 })
 
 function jsregexp.mt.match(re, str)
-    local jstr = jsregexp.to_jsstring(str)
-    if not re.global then return re:exec(jstr) end
-    local matches = {}
-    local val
-
-    re.last_index = 1
-
-    while true do
-        val = re:exec(jstr)
-        if val == nil then break end
-        table.insert(matches, val)
-        if #val[0] == 0 then re.last_index = re.last_index + 1 end
-    end
-    if #matches == 0 then return nil end
-    return matches
+	local jstr = jsregexp.to_jsstring(str)
+	if not re.global then
+		return re:exec(jstr)
+	end
+	local matches = {}
+	local val
+
+	re.last_index = 1
+
+	while true do
+		val = re:exec(jstr)
+		if val == nil then
+			break
+		end
+		table.insert(matches, val)
+		if #val[0] == 0 then
+			re.last_index = re.last_index + 1
+		end
+	end
+	if #matches == 0 then
+		return nil
+	end
+	return matches
 end
 
 function jsregexp.mt.match_all(re, str)
-    -- must duplicate (according to string.proptype.matchAll spec)
-    local re2 = jsregexp.compile(re.source, re.flags)
-    local jstr = jsregexp.to_jsstring(str)
-    return function() return re2:exec(jstr) end
+	-- must duplicate (according to the String.prototype.matchAll spec)
+	local re2 = jsregexp.compile(re.source, re.flags)
+	local jstr = jsregexp.to_jsstring(str)
+	return function()
+		return re2:exec(jstr)
+	end
 end
 
 function jsregexp.mt.match_all_list(re, str)
-    local matches = {}
-    for match in jsregexp.mt.match_all(re, str) do table.insert(matches, match) end
-    return matches
+	local matches = {}
+	for match in jsregexp.mt.match_all(re, str) do
+		table.insert(matches, match)
+	end
+	return matches
 end
 
 function jsregexp.mt.search(re, str)
-    -- spec says to start at 1 and restore last_index
-    local prev_last_index = re.last_index
-    re.last_index = 1
-    local match = re:exec(str)
-    re.last_index = prev_last_index
-    if match == nil then return -1 end
-    return match.index
+	-- spec says to start at 1 and restore last_index
+	local prev_last_index = re.last_index
+	re.last_index = 1
+	local match = re:exec(str)
+	re.last_index = prev_last_index
+	if match == nil then
+		return -1
+	end
+	return match.index
 end
 
 function jsregexp.mt.split(re, str, limit)
-    if limit == nil then limit = math.huge end
-    if limit == 0 then return {} end
-    assert(limit >= 0, "limit must be non-negative")
+	if limit == nil then
+		limit = math.huge
+	end
+	if limit == 0 then
+		return {}
+	end
+	assert(limit >= 0, "limit must be non-negative")
 
-    local jstr = jsregexp.to_jsstring(str)
-    local re2 = jsregexp.compile(re.source, re.flags .. "y") -- add sticky
+	local jstr = jsregexp.to_jsstring(str)
+	local re2 = jsregexp.compile(re.source, re.flags .. "y") -- add sticky
 
-    local count = 0
-    local split = {}
+	local count = 0
+	local split = {}
 	local prev_index = 1
-    while count < limit do
+	while count < limit do
 		local li = re2.last_index
-        local match = re2:exec(jstr)
+		local match = re2:exec(jstr)
 		if match then
 			if #str == 0 then
 				break
 			end
 			local sub = string.sub(str, prev_index, match.index - 1)
-			if #sub > 0 or #match[0] > 0 then table.insert(split, sub) end
+			if #sub > 0 or #match[0] > 0 then
+				table.insert(split, sub)
+			end
 			for _, group in ipairs(match) do
 				if count < limit then
 					table.insert(split, group)
@@ -83,105 +105,109 @@ function jsregexp.mt.split(re, str, limit)
 			table.insert(split, sub)
 			break
 		end
-    end
-    return split
+	end
+	return split
 end
 
 local function is_digit(c, i)
-    local b = string.byte(c, i, i + 1)
-    return b >= string.byte('0') and b <= string.byte('9')
+	local b = string.byte(c, i, i + 1)
+	return b >= string.byte("0") and b <= string.byte("9")
 end
 
 local function get_substitution(match, str, replacement)
-    local result = {}
-
-    local i = 1
-    local repl_len = #replacement
-
-    while true do
-        local j = string.find(replacement, "$", i, true)
-        if j == nil or j + 1 > repl_len then break end
-        table.insert(result, string.sub(replacement, i, j - 1))
-        local j0 = j
-        local c = string.sub(replacement, j + 1, j + 1)
-        j = j + 2
-        if c == '$' then
-            table.insert(result, "$")
-        elseif c == '&' then
-            table.insert(result, match[0])
-        elseif c == '`' then
-            table.insert(result, string.sub(str, 1, match.index))
-        elseif c == '\'' then
-            table.insert(result, string.sub(str, match.index + #match[0]))
-        elseif is_digit(c, 1) then
-            local k = c
-            local kv
-            local dig2 = false
-            if j <= repl_len and is_digit(replacement, j) then
-                k = k .. string.sub(replacement, j, j)
-                dig2 = true
-            end
-            local kv1 = tonumber(k)
-            assert(kv1 ~= nil)
-
-            -- This behavior is specified in ES6 and refined in ECMA 2019
-            if dig2 and kv1 >= 1 and match[kv1] ~= nil then
-                kv = kv1
-                j = j + 1
-            else
-                kv = tonumber(k)
-                assert(kv ~= nil)
-            end
-            if kv >= 1 and match[kv] ~= nil then
-                table.insert(result, match[kv])
-            else
-                table.insert(result, string.sub(replacement, j0, j))
-            end
-        elseif c == '<' and match.groups ~= nil then
-            local k = string.find(replacement, ">", j, true)
-            if k == nil then
-                table.insert(result, string.sub(replacement, j0, j))
-            else
-                local name = string.sub(replacement, j, k - 1)
-                local capture = match.groups[name]
-                assert(capture ~= nil, "invalid capture name: " .. name)
-                table.insert(result, capture)
-                j = k + 1
-            end
-        else
-            table.insert(result, string.sub(replacement, j0, j))
-        end
-
-        i = j
-    end
-    table.insert(result, string.sub(replacement, i))
-    return table.concat(result)
+	local result = {}
+
+	local i = 1
+	local repl_len = #replacement
+
+	while true do
+		local j = string.find(replacement, "$", i, true)
+		if j == nil or j + 1 > repl_len then
+			break
+		end
+		table.insert(result, string.sub(replacement, i, j - 1))
+		local j0 = j
+		local c = string.sub(replacement, j + 1, j + 1)
+		j = j + 2
+		if c == "$" then
+			table.insert(result, "$")
+		elseif c == "&" then
+			table.insert(result, match[0])
+		elseif c == "`" then
+			table.insert(result, string.sub(str, 1, match.index))
+		elseif c == "'" then
+			table.insert(result, string.sub(str, match.index + #match[0]))
+		elseif is_digit(c, 1) then
+			local k = c
+			local kv
+			local dig2 = false
+			if j <= repl_len and is_digit(replacement, j) then
+				k = k .. string.sub(replacement, j, j)
+				dig2 = true
+			end
+			local kv1 = tonumber(k)
+			assert(kv1 ~= nil)
+
+			-- This behavior is specified in ES6 and refined in ECMA 2019
+			if dig2 and kv1 >= 1 and match[kv1] ~= nil then
+				kv = kv1
+				j = j + 1
+			else
+				kv = tonumber(k)
+				assert(kv ~= nil)
+			end
+			if kv >= 1 and match[kv] ~= nil then
+				table.insert(result, match[kv])
+			else
+				table.insert(result, string.sub(replacement, j0, j))
+			end
+		elseif c == "<" and match.groups ~= nil then
+			local k = string.find(replacement, ">", j, true)
+			if k == nil then
+				table.insert(result, string.sub(replacement, j0, j))
+			else
+				local name = string.sub(replacement, j, k - 1)
+				local capture = match.groups[name]
+				assert(capture ~= nil, "invalid capture name: " .. name)
+				table.insert(result, capture)
+				j = k + 1
+			end
+		else
+			table.insert(result, string.sub(replacement, j0, j))
+		end
+
+		i = j
+	end
+	table.insert(result, string.sub(replacement, i))
+	return table.concat(result)
 end
 
 function jsregexp.mt.replace_all(re, str, replacement)
-    local jstr = jsregexp.to_jsstring(str)
-
-    re.last_index = 1
-
-    local output = {}
-
-    local prev_index = 1
-    local cur_index = 1
-    while true do
-        prev_index = re.last_index
-        local match = re:exec(jstr)
-        if match == nil then break end
-        cur_index = re.last_index
-
-        table.insert(output, string.sub(str, prev_index, match.index - 1))
-        if type(replacement) == "function" then
-            table.insert(output, replacement(match, str))
-        else
-            table.insert(output, get_substitution(match, str, replacement))
-        end
-    end
-    table.insert(output, string.sub(str, cur_index))
-    return table.concat(output)
+	local jstr = jsregexp.to_jsstring(str)
+
+	re.last_index = 1
+
+	local output = {}
+
+	local prev_index = 1
+	local cur_index = 1
+	while true do
+		prev_index = re.last_index
+		local match = re:exec(jstr)
+		if match == nil then
+			break
+		end
+		cur_index = re.last_index
+
+		table.insert(output, string.sub(str, prev_index, match.index - 1))
+		if type(replacement) == "function" then
+			table.insert(output, replacement(match, str))
+		else
+			table.insert(output, get_substitution(match, str, replacement))
+		end
+	end
+	table.insert(output, string.sub(str, cur_index))
+	return table.concat(output)
 end
 
 function jsregexp.mt.replace(re, str, replacement)
@@ -189,11 +215,11 @@ function jsregexp.mt.replace(re, str, replacement)
 		return jsregexp.mt.replace_all(re, str, replacement)
 	end
 
-    local jstr = jsregexp.to_jsstring(str)
+	local jstr = jsregexp.to_jsstring(str)
 
-    re.last_index = 1
+	re.last_index = 1
 
-    local output = {}
+	local output = {}
 
 	local match = re:exec(jstr)
 	if match then
diff --git a/test.lua b/test.lua
index 9fdd86c..7786b9f 100644
--- a/test.lua
+++ b/test.lua
@@ -65,10 +65,17 @@ local function test_call(str, regex, flags, want)
 				fails = fails + 1
 				return
 			end
-			for k,v in pairs(want.named_groups) do
+			for k, v in pairs(want.named_groups) do
 				if val.named_groups[k] ~= v then
 					fails = fails + 1
-					print(string.format("named group mismatch group '%s': expected '%s', actual '%s'", k, v, val.named_groups[k]))
+					print(
+						string.format(
+							"named group mismatch group '%s': expected '%s', actual '%s'",
+							k,
+							v,
+							val.named_groups[k]
+						)
+					)
 					return
 				end
 			end
@@ -117,7 +124,9 @@ local function test_exec(str, regex, flags, want)
 		if match_wanted.groups then
 			for key, val in pairs(match_wanted.groups) do
 				if val ~= match.groups[key] then
-					return fail(string.format("named group %s mismatch, wanted %s, got %s", key, val, match.groups[key]))
+					return fail(
+						string.format("named group %s mismatch, wanted %s, got %s", key, val, match.groups[key])
+					)
 				end
 			end
 		end
@@ -249,7 +258,7 @@ local function test_split(str, regex, flags, want)
 	end
 	local split = r:split(str)
 	local min = math.min(#split, #want)
-	for i = 1,min do
+	for i = 1, min do
 		local w = want[i]
 		if w ~= split[i] then
 			return fail("split mismatch, wanted %s, got %s", w, split[i])
@@ -286,95 +295,174 @@ test_compile("dummy", "[", "", nil)
 -- (luajit at least..)
 test_compile("dummy", string.char(0xfd, 166, 178, 165, 138, 183), "", nil)
 
-test_call("dummy", ".", "", {{"d"}})
-test_call("du", ".", "g", {{"d"}, {"u"}})
+test_call("dummy", ".", "", { { "d" } })
+test_call("du", ".", "g", { { "d" }, { "u" } })
 
 test_call("dummy", "c", "", {})
 test_call("dummy", "c", "g", {})
-test_call("dummy", "d", "", {{"d"}})
-test_call("dummy", "m", "", {{"m"}})
-test_call("dummy", "m", "g", {{"m"}, {"m"}})
-
-test_call("dummy", "(dummy)", "", {{"dummy", groups = {"dummy"}}})
-test_call("The quick brown fox jumps over the lazy dog", "\\w+", "", {{"The"}})
-test_call("The quick brown fox jumps over the lazy dog", "\\w+", "g", {{"The"}, {"quick"}, {"brown"}, {"fox"}, {"jumps"}, {"over"}, {"the"}, {"lazy"}, {"dog"}})
-test_call("The quick brown fox jumps over the lazy dog", "[aeiou]{2,}", "g", {{"ui"}})
-
-test_call("äöü", ".", "g", {{"ä"}, {"ö"}, {"ü"}})
-test_call("äöü", ".", "", {{"ä"}})
-test_call("ÄÖÜ", ".", "", {{"Ä"}})
-test_call("äöü", "[äöü]", "g", {{"ä"}, {"ö"}, {"ü"}})
-test_call("äöü", "[äöü]*", "g", {{"äöü"}, {""}})
-test_call("äÄ", "ä", "gi", {{"ä"}, {"Ä"}})
-test_call("öäü.haha", "([^.]*)\\.(.*)", "", {{"öäü.haha", groups={"öäü", "haha"}}})
-
-test_call("𝄞", "𝄞", "", {{"𝄞"}})
+test_call("dummy", "d", "", { { "d" } })
+test_call("dummy", "m", "", { { "m" } })
+test_call("dummy", "m", "g", { { "m" }, { "m" } })
+
+test_call("dummy", "(dummy)", "", { { "dummy", groups = { "dummy" } } })
+test_call("The quick brown fox jumps over the lazy dog", "\\w+", "", { { "The" } })
+test_call(
+	"The quick brown fox jumps over the lazy dog",
+	"\\w+",
+	"g",
+	{ { "The" }, { "quick" }, { "brown" }, { "fox" }, { "jumps" }, { "over" }, { "the" }, { "lazy" }, { "dog" } }
+)
+test_call("The quick brown fox jumps over the lazy dog", "[aeiou]{2,}", "g", { { "ui" } })
+
+test_call("äöü", ".", "g", { { "ä" }, { "ö" }, { "ü" } })
+test_call("äöü", ".", "", { { "ä" } })
+test_call("ÄÖÜ", ".", "", { { "Ä" } })
+test_call("äöü", "[äöü]", "g", { { "ä" }, { "ö" }, { "ü" } })
+test_call("äöü", "[äöü]*", "g", { { "äöü" }, { "" } })
+test_call("äÄ", "ä", "gi", { { "ä" }, { "Ä" } })
+test_call("öäü.haha", "([^.]*)\\.(.*)", "", { { "öäü.haha", groups = { "öäü", "haha" } } })
+
+test_call("𝄞", "𝄞", "", { { "𝄞" } })
 -- these empty matches are expected and consistent with vscode
-test_call("öö öö", "ö*", "g", {{"öö"}, {""}, {"öö"}, {""}})
-test_call("𝄞𝄞 𝄞𝄞", "[^ ]*", "g", {{"𝄞𝄞"}, {""}, {"𝄞𝄞"}, {""}})
-test_call("𝄞𝄞", "𝄞*", "", {{"𝄞𝄞"}})
+test_call("öö öö", "ö*", "g", { { "öö" }, { "" }, { "öö" }, { "" } })
+test_call("𝄞𝄞 𝄞𝄞", "[^ ]*", "g", { { "𝄞𝄞" }, { "" }, { "𝄞𝄞" }, { "" } })
+test_call("𝄞𝄞", "𝄞*", "", { { "𝄞𝄞" } })
 -- doesn't work in vscode, matches only a single 𝄞 each time:
-test_call("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", {{"𝄞𝄞"}, {""}, {"𝄞𝄞"}, {""}})
+test_call("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { { "𝄞𝄞" }, { "" }, { "𝄞𝄞" }, { "" } })
 -- vscode actually splits the center unicode character and produces an extra empty match. we don't.
-test_call("öö𐐷öö", "ö*", "g", {{"öö"}, {""}, {"öö"}, {""}})
-test_call("a", "𝄞|a", "g", {{"a"}}) -- utf16 regex, ascii input
+test_call("öö𐐷öö", "ö*", "g", { { "öö" }, { "" }, { "öö" }, { "" } })
+test_call("a", "𝄞|a", "g", { { "a" } }) -- utf16 regex, ascii input
 
-test_call("κόσμε", "(κόσμε)", "", {{"κόσμε", groups={"κόσμε"}}})
+test_call("κόσμε", "(κόσμε)", "", { { "κόσμε", groups = { "κόσμε" } } })
 
-test_call("jordbær fløde på", "(jordbær fløde på)", "", {{"jordbær fløde på", groups={"jordbær fløde på"}}})
+test_call(
+	"jordbær fløde på",
+	"(jordbær fløde på)",
+	"",
+	{ { "jordbær fløde på", groups = { "jordbær fløde på" } } }
+)
 
-test_call("Heizölrückstoßabdämpfung", "(Heizölrückstoßabdämpfung)", "", {{"Heizölrückstoßabdämpfung", groups={"Heizölrückstoßabdämpfung"}}})
+test_call(
+	"Heizölrückstoßabdämpfung",
+	"(Heizölrückstoßabdämpfung)",
+	"",
+	{ { "Heizölrückstoßabdämpfung", groups = { "Heizölrückstoßabdämpfung" } } }
+)
 
-test_call("Fête l'haï volapük", "(Fête l'haï volapük)", "", {{"Fête l'haï volapük", groups={"Fête l'haï volapük"}}})
+test_call(
+	"Fête l'haï volapük",
+	"(Fête l'haï volapük)",
+	"",
+	{ { "Fête l'haï volapük", groups = { "Fête l'haï volapük" } } }
+)
 
-test_call("Árvíztűrő tükörfúrógép", "(Árvíztűrő tükörfúrógép)", "", {{"Árvíztűrő tükörfúrógép", groups={"Árvíztűrő tükörfúrógép"}}})
+test_call(
+	"Árvíztűrő tükörfúrógép",
+	"(Árvíztűrő tükörfúrógép)",
+	"",
+	{ { "Árvíztűrő tükörfúrógép", groups = { "Árvíztűrő tükörfúrógép" } } }
+)
 
-test_call("いろはにほへとちりぬるを", "(いろはにほへとちりぬるを)", "", {{"いろはにほへとちりぬるを", groups={"いろはにほへとちりぬるを"}}})
+test_call(
+	"いろはにほへとちりぬるを",
+	"(いろはにほへとちりぬるを)",
+	"",
+	{ { "いろはにほへとちりぬるを", groups = { "いろはにほへとちりぬるを" } } }
+)
 
-test_call("Съешь же ещё этих мягких французских булок да выпей чаю", "(Съешь же ещё этих мягких французских булок да выпей чаю)", "", {{"Съешь же ещё этих мягких французских булок да выпей чаю", groups={"Съешь же ещё этих мягких французских булок да выпей чаю"}}})
+test_call(
+	"Съешь же ещё этих мягких французских булок да выпей чаю",
+	"(Съешь же ещё этих мягких французских булок да выпей чаю)",
+	"",
+	{
+		{
+			"Съешь же ещё этих мягких французских булок да выпей чаю",
+			groups = {
+				"Съешь же ещё этих мягких французских булок да выпей чаю",
+			},
+		},
+	}
+)
 
 -- no idea how thai works
 -- test("จงฝ่าฟันพัฒนาวิชาการ", "(จงฝ่าฟันพัฒนาวิชาการ)", "", {{"จงฝ่าฟันพัฒนาวิชาการ", groups="จงฝ่าฟันพัฒนาวิชาการ"}})
 
-
 -- named groups:
-test_call("The quick brown fox jumps over the lazy dog", "(?<first_word>\\w+) (\\w+) (?<third_word>\\w+)", "n",
-{{"The quick brown", groups={"The", "quick", "brown"}, named_groups={first_word="The", third_word="brown"}}}
+test_call(
+	"The quick brown fox jumps over the lazy dog",
+	"(?<first_word>\\w+) (\\w+) (?<third_word>\\w+)",
+	"n",
+	{ { "The quick brown", groups = { "The", "quick", "brown" }, named_groups = { first_word = "The", third_word = "brown" } } }
 )
-test_call("The qüick bröwn föx jümps över the lazy dög", "(?<first_word>[^ ]+) ([^ ]+) (?<third_word>[^ ]+)", "n",
-{{"The qüick bröwn", groups={"The", "qüick", "bröwn"}, named_groups={first_word="The", third_word="bröwn"}}}
+test_call(
+	"The qüick bröwn föx jümps över the lazy dög",
+	"(?<first_word>[^ ]+) ([^ ]+) (?<third_word>[^ ]+)",
+	"n",
+	{ { "The qüick bröwn", groups = { "The", "qüick", "bröwn" }, named_groups = {
+		first_word = "The",
+		third_word = "bröwn",
+	} } }
 )
-test_call("The quick bröwn föx", "(?<first_wörd>[^ ]+) ([^ ]+) (?<third_wörd>[^ ]+)", "n",
-{{"The quick bröwn", groups={"The", "quick", "bröwn"}, named_groups={["first_wörd"]="The", ["third_wörd"]="bröwn"}}}
+test_call(
+	"The quick bröwn föx",
+	"(?<first_wörd>[^ ]+) ([^ ]+) (?<third_wörd>[^ ]+)",
+	"n",
+	{
+		{
+			"The quick bröwn",
+			groups = { "The", "quick", "bröwn" },
+			named_groups = { ["first_wörd"] = "The", ["third_wörd"] = "bröwn" },
+		},
+	}
+)
+test_call(
+	"𝄞𝄞 𐐷",
+	"(?<word>[^ ]+)",
+	"ng",
+	{
+		{ "𝄞𝄞", groups = { "𝄞𝄞" }, named_groups = { word = "𝄞𝄞" } },
+		{ "𐐷", groups = { "𐐷" }, named_groups = { word = "𐐷" } },
+	}
 )
-test_call("𝄞𝄞 𐐷", "(?<word>[^ ]+)", "ng", {{"𝄞𝄞", groups={"𝄞𝄞"}, named_groups={word="𝄞𝄞"}}, {"𐐷", groups={"𐐷"}, named_groups={word="𐐷"}}})
 
-test_exec("The quick brown", "\\w+", "g", {{[0]="The"}, {[0]="quick"}, {[0]="brown"}})
-test_exec("The quick brown fox", "(\\w+) (\\w+)", "g", {{[0]="The quick", "The", "quick"}, {[0]="brown fox", "brown", "fox"}})
-test_exec("The quick brown fox", "(?<word1>\\w+) (\\w+)", "g",
-{{[0]="The quick", "The", "quick", groups={word1="The"}}, {[0]="brown fox", "brown", "fox", groups={word1="brown"}}})
+test_exec("The quick brown", "\\w+", "g", { { [0] = "The" }, { [0] = "quick" }, { [0] = "brown" } })
+test_exec(
+	"The quick brown fox",
+	"(\\w+) (\\w+)",
+	"g",
+	{ { [0] = "The quick", "The", "quick" }, { [0] = "brown fox", "brown", "fox" } }
+)
+test_exec(
+	"The quick brown fox",
+	"(?<word1>\\w+) (\\w+)",
+	"g",
+	{
+		{ [0] = "The quick", "The", "quick", groups = { word1 = "The" } },
+		{ [0] = "brown fox", "brown", "fox", groups = { word1 = "brown" } },
+	}
+)
 
-test_test("The quick brown", "\\w+", "", {true})
-test_test("The quick brown", "\\d+", "", {false})
-test_test("The quick brown", "\\w+", "g", {true, true, true})
+test_test("The quick brown", "\\w+", "", { true })
+test_test("The quick brown", "\\d+", "", { false })
+test_test("The quick brown", "\\w+", "g", { true, true, true })
 
 test_match("The quick brown", "\\d+", "g", nil)
-test_match("The quick brown", "\\w+", "g", {"The", "quick", "brown"})
+test_match("The quick brown", "\\w+", "g", { "The", "quick", "brown" })
 
 test_match_all_list("The quick brown", "\\d+", "g", {})
-test_match_all_list("The quick brown", "\\w+", "g", {"The", "quick", "brown"})
+test_match_all_list("The quick brown", "\\w+", "g", { "The", "quick", "brown" })
 
 test_search("The quick brown", "nothing", "g", -1)
 test_search("The quick brown", "quick", "g", 5)
 
-test_split("abc", "x", "g", {"abc"})
+test_split("abc", "x", "g", { "abc" })
 test_split("", "a?", "g", {})
-test_split("", "a", "g", {""})
-test_split("1-2-3", "-", "g", {"1", "2", "3"})
-test_split("1-2-", "-", "g", {"1", "2", ""})
-test_split("-2-3", "-", "g", {"", "2", "3"})
-test_split("--", "-", "g", {"", "", ""})
-test_split("Hello 1 word. Sentence number 2.", "(\\d)", "g", {"Hello ", "1", " word. Sentence number ", "2", "."})
+test_split("", "a", "g", { "" })
+test_split("1-2-3", "-", "g", { "1", "2", "3" })
+test_split("1-2-", "-", "g", { "1", "2", "" })
+test_split("-2-3", "-", "g", { "", "2", "3" })
+test_split("--", "-", "g", { "", "", "" })
+test_split("Hello 1 word. Sentence number 2.", "(\\d)", "g", { "Hello ", "1", " word. Sentence number ", "2", "." })
 
 test_replace("a1b2c", "X", "g", "_", "a1b2c")
 test_replace("a1b2c", "\\d", "", "_", "a_b2c")

From da4d4c036d5c5217ffdfb12fafcb6298a741cc7e Mon Sep 17 00:00:00 2001
From: kmarius <5224719+kmarius@users.noreply.github.com>
Date: Tue, 25 Jun 2024 15:18:28 +0200
Subject: [PATCH 4/8] update libregexp

---
 jsregexp.c                   |   16 +-
 libregexp/cutils.c           |    6 +-
 libregexp/cutils.h           |   72 +-
 libregexp/libregexp-opcode.h |    5 +-
 libregexp/libregexp.c        | 4393 +++++++++++++++++-----------------
 libregexp/libregexp.h        |   49 +-
 libregexp/libunicode-table.h |  285 ++-
 libregexp/libunicode.c       |  564 ++++-
 libregexp/libunicode.h       |  102 +-
 9 files changed, 2980 insertions(+), 2512 deletions(-)

diff --git a/jsregexp.c b/jsregexp.c
index 305256d..e01d2be 100644
--- a/jsregexp.c
+++ b/jsregexp.c
@@ -31,6 +31,14 @@
 
 #define streq(X, Y) ((*(X) == *(Y)) && strcmp(X, Y) == 0)
 
+// these two functions need to be defined for libregexp
+void *lre_realloc(void *opaque, void *ptr, size_t size) {
+  return realloc(ptr, size);
+}
+BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) {
+  return FALSE;
+}
+
 struct regexp {
   char *expr;
   uint8_t *bc;
@@ -332,7 +340,7 @@ static void regexp_pushflags(lua_State *lstate, const struct regexp *r) {
   const char *multiline = (flags & LRE_FLAG_MULTILINE) ? "m" : "";
   const char *named_groups = (flags & LRE_FLAG_NAMED_GROUPS) ? "n" : "";
   const char *dotall = (flags & LRE_FLAG_DOTALL) ? "s" : "";
-  const char *utf16 = (flags & LRE_FLAG_UTF16) ? "u" : "";
+  const char *utf16 = (flags & LRE_FLAG_UNICODE) ? "u" : "";
   const char *sticky = (flags & LRE_FLAG_STICKY) ? "y" : "";
   lua_pushfstring(lstate, "%s%s%s%s%s%s%s", ignorecase, global, multiline,
                   named_groups, dotall, utf16, sticky);
@@ -498,7 +506,7 @@ static int regexp_index(lua_State *lstate) {
     } else if (streq(key, "sticky")) {
       lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_STICKY);
     } else if (streq(key, "unicode")) {
-      lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_UTF16);
+      lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_UNICODE);
     } else if (streq(key, "source")) {
       lua_pushstring(lstate, r->expr);
     } else if (streq(key, "flags")) {
@@ -549,7 +557,7 @@ static int jsregexp_compile(lua_State *lstate) {
 
   if (utf8_contains_non_bmp(regexp)) {
     // bmp range works fine without utf16 flag
-    re_flags |= LRE_FLAG_UTF16;
+    re_flags |= LRE_FLAG_UNICODE;
   }
 
   if (!lua_isnoneornil(lstate, 2)) {
@@ -572,7 +580,7 @@ static int jsregexp_compile(lua_State *lstate) {
         re_flags |= LRE_FLAG_DOTALL;
         break;
       case 'u':
-        re_flags |= LRE_FLAG_UTF16;
+        re_flags |= LRE_FLAG_UNICODE;
         break;
       case 'y':
         re_flags |= LRE_FLAG_STICKY;
diff --git a/libregexp/cutils.c b/libregexp/cutils.c
index a02fb76..c0aacef 100644
--- a/libregexp/cutils.c
+++ b/libregexp/cutils.c
@@ -1,6 +1,6 @@
 /*
  * C utilities
- * 
+ *
  * Copyright (c) 2017 Fabrice Bellard
  * Copyright (c) 2018 Charlie Gordon
  *
@@ -140,7 +140,7 @@ int dbuf_put(DynBuf *s, const uint8_t *data, size_t len)
         if (dbuf_realloc(s, s->size + len))
             return -1;
     }
-    memcpy(s->buf + s->size, data, len);
+    memcpy_no_ub(s->buf + s->size, data, len);
     s->size += len;
     return 0;
 }
@@ -172,7 +172,7 @@ int __attribute__((format(printf, 2, 3))) dbuf_printf(DynBuf *s,
     va_list ap;
     char buf[128];
     int len;
-    
+
     va_start(ap, fmt);
     len = vsnprintf(buf, sizeof(buf), fmt, ap);
     va_end(ap);
diff --git a/libregexp/cutils.h b/libregexp/cutils.h
index 31f7cd8..f079e5c 100644
--- a/libregexp/cutils.h
+++ b/libregexp/cutils.h
@@ -1,6 +1,6 @@
 /*
  * C utilities
- * 
+ *
  * Copyright (c) 2017 Fabrice Bellard
  * Copyright (c) 2018 Charlie Gordon
  *
@@ -26,11 +26,9 @@
 #define CUTILS_H
 
 #include <stdlib.h>
+#include <string.h>
 #include <inttypes.h>
 
-/* set if CPU is big endian */
-#undef WORDS_BIGENDIAN
-
 #define likely(x)       __builtin_expect(!!(x), 1)
 #define unlikely(x)     __builtin_expect(!!(x), 0)
 #define force_inline inline __attribute__((always_inline))
@@ -48,6 +46,16 @@
 #ifndef countof
 #define countof(x) (sizeof(x) / sizeof((x)[0]))
 #endif
+#ifndef container_of
+/* return the pointer of type 'type *' containing 'ptr' as field 'member' */
+#define container_of(ptr, type, member) ((type *)((uint8_t *)(ptr) - offsetof(type, member)))
+#endif
+
+#if !defined(_MSC_VER) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define minimum_length(n)  static n
+#else
+#define minimum_length(n)  n
+#endif
 
 typedef int BOOL;
 
@@ -63,6 +71,12 @@ char *pstrcat(char *buf, int buf_size, const char *s);
 int strstart(const char *str, const char *val, const char **ptr);
 int has_suffix(const char *str, const char *suffix);
 
+/* Prevent UB when n == 0 and (src == NULL or dest == NULL) */
+static inline void memcpy_no_ub(void *dest, const void *src, size_t n) {
+    if (n)
+        memcpy(dest, src, n);
+}
+
 static inline int max_int(int a, int b)
 {
     if (a > b)
@@ -207,28 +221,34 @@ static inline void put_u8(uint8_t *tab, uint8_t val)
     *tab = val;
 }
 
+#ifndef bswap16
 static inline uint16_t bswap16(uint16_t x)
 {
     return (x >> 8) | (x << 8);
 }
+#endif
 
+#ifndef bswap32
 static inline uint32_t bswap32(uint32_t v)
 {
     return ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >>  8) |
         ((v & 0x0000ff00) <<  8) | ((v & 0x000000ff) << 24);
 }
+#endif
 
+#ifndef bswap64
 static inline uint64_t bswap64(uint64_t v)
 {
-    return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) | 
-        ((v & ((uint64_t)0xff << (6 * 8))) >> (5 * 8)) | 
-        ((v & ((uint64_t)0xff << (5 * 8))) >> (3 * 8)) | 
-        ((v & ((uint64_t)0xff << (4 * 8))) >> (1 * 8)) | 
-        ((v & ((uint64_t)0xff << (3 * 8))) << (1 * 8)) | 
-        ((v & ((uint64_t)0xff << (2 * 8))) << (3 * 8)) | 
-        ((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) | 
+    return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) |
+        ((v & ((uint64_t)0xff << (6 * 8))) >> (5 * 8)) |
+        ((v & ((uint64_t)0xff << (5 * 8))) >> (3 * 8)) |
+        ((v & ((uint64_t)0xff << (4 * 8))) >> (1 * 8)) |
+        ((v & ((uint64_t)0xff << (3 * 8))) << (1 * 8)) |
+        ((v & ((uint64_t)0xff << (2 * 8))) << (3 * 8)) |
+        ((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) |
         ((v & ((uint64_t)0xff << (0 * 8))) << (7 * 8));
 }
+#endif
 
 /* XXX: should take an extra argument to pass slack information to the caller */
 typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size);
@@ -278,6 +298,36 @@ static inline void dbuf_set_error(DynBuf *s)
 int unicode_to_utf8(uint8_t *buf, unsigned int c);
 int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
 
+static inline BOOL is_surrogate(uint32_t c)
+{
+    return (c >> 11) == (0xD800 >> 11); // 0xD800-0xDFFF
+}
+
+static inline BOOL is_hi_surrogate(uint32_t c)
+{
+    return (c >> 10) == (0xD800 >> 10); // 0xD800-0xDBFF
+}
+
+static inline BOOL is_lo_surrogate(uint32_t c)
+{
+    return (c >> 10) == (0xDC00 >> 10); // 0xDC00-0xDFFF
+}
+
+static inline uint32_t get_hi_surrogate(uint32_t c)
+{
+    return (c >> 10) - (0x10000 >> 10) + 0xD800;
+}
+
+static inline uint32_t get_lo_surrogate(uint32_t c)
+{
+    return (c & 0x3FF) | 0xDC00;
+}
+
+static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo)
+{
+    return 0x10000 + 0x400 * (hi - 0xD800) + (lo - 0xDC00);
+}
+
 static inline int from_hex(int c)
 {
     if (c >= '0' && c <= '9')
diff --git a/libregexp/libregexp-opcode.h b/libregexp/libregexp-opcode.h
index f90c23b..f255e09 100644
--- a/libregexp/libregexp-opcode.h
+++ b/libregexp/libregexp-opcode.h
@@ -1,6 +1,6 @@
 /*
  * Regular Expression Engine
- * 
+ *
  * Copyright (c) 2017-2018 Fabrice Bellard
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -50,8 +50,7 @@ DEF(range32, 3) /* variable length */
 DEF(lookahead, 5)
 DEF(negative_lookahead, 5)
 DEF(push_char_pos, 1) /* push the character position on the stack */
-DEF(bne_char_pos, 5) /* pop one stack element and jump if equal to the character
- position */
+DEF(check_advance, 1) /* pop one stack element and check that it is different from the character position */
 DEF(prev, 1) /* go to the previous char */
 DEF(simple_greedy_quant, 17)
 
diff --git a/libregexp/libregexp.c b/libregexp/libregexp.c
index 9637aed..a2d56a7 100644
--- a/libregexp/libregexp.c
+++ b/libregexp/libregexp.c
@@ -21,22 +21,20 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#include <assert.h>
-#include <inttypes.h>
-#include <stdarg.h>
-#include <stdio.h>
 #include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <inttypes.h>
 #include <string.h>
+#include <assert.h>
 
 #include "cutils.h"
 #include "libregexp.h"
+#include "libunicode.h"
 
 /*
   TODO:
 
-  - Add full unicode canonicalize rules for character ranges (not
-    really useful but needed for exact "ignorecase" compatibility).
-
   - Add a lock step execution mode (=linear time execution guaranteed)
     when the regular expression is "simple" i.e. no backreference nor
     complicated lookahead. The opcodes are designed for this execution
@@ -48,417 +46,357 @@
 #endif
 
 typedef enum {
-#define DEF(id, size) REOP_##id,
+#define DEF(id, size) REOP_ ## id,
 #include "libregexp-opcode.h"
 #undef DEF
-  REOP_COUNT,
+    REOP_COUNT,
 } REOPCodeEnum;
 
 #define CAPTURE_COUNT_MAX 255
 #define STACK_SIZE_MAX 255
 
 /* unicode code points */
-#define CP_LS 0x2028
-#define CP_PS 0x2029
+#define CP_LS   0x2028
+#define CP_PS   0x2029
 
 #define TMP_BUF_SIZE 128
 
 typedef struct {
-  DynBuf byte_code;
-  const uint8_t *buf_ptr;
-  const uint8_t *buf_end;
-  const uint8_t *buf_start;
-  int re_flags;
-  BOOL is_utf16;
-  BOOL ignore_case;
-  BOOL dotall;
-  int capture_count;
-  int total_capture_count; /* -1 = not computed yet */
-  int has_named_captures;  /* -1 = don't know, 0 = no, 1 = yes */
-  void *opaque;
-  DynBuf group_names;
-  union {
-    char error_msg[TMP_BUF_SIZE];
-    char tmp_buf[TMP_BUF_SIZE];
-  } u;
+    DynBuf byte_code;
+    const uint8_t *buf_ptr;
+    const uint8_t *buf_end;
+    const uint8_t *buf_start;
+    int re_flags;
+    BOOL is_unicode;
+    BOOL ignore_case;
+    BOOL dotall;
+    int capture_count;
+    int total_capture_count; /* -1 = not computed yet */
+    int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
+    void *opaque;
+    DynBuf group_names;
+    union {
+        char error_msg[TMP_BUF_SIZE];
+        char tmp_buf[TMP_BUF_SIZE];
+    } u;
 } REParseState;
 
 typedef struct {
 #ifdef DUMP_REOP
-  const char *name;
+    const char *name;
 #endif
-  uint8_t size;
+    uint8_t size;
 } REOpCode;
 
 static const REOpCode reopcode_info[REOP_COUNT] = {
 #ifdef DUMP_REOP
-#define DEF(id, size) {#id, size},
+#define DEF(id, size) { #id, size },
 #else
-#define DEF(id, size) {size},
+#define DEF(id, size) { size },
 #endif
 #include "libregexp-opcode.h"
 #undef DEF
 };
 
-#define RE_HEADER_FLAGS 0
+#define RE_HEADER_FLAGS         0
 #define RE_HEADER_CAPTURE_COUNT 1
-#define RE_HEADER_STACK_SIZE 2
+#define RE_HEADER_STACK_SIZE    2
+#define RE_HEADER_BYTECODE_LEN  3
 
 #define RE_HEADER_LEN 7
 
-static inline int is_digit(int c) { return c >= '0' && c <= '9'; }
-
-/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
-static int dbuf_insert(DynBuf *s, int pos, int len) {
-  if (dbuf_realloc(s, s->size + len))
-    return -1;
-  memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
-  s->size += len;
-  return 0;
+static inline int is_digit(int c) {
+    return c >= '0' && c <= '9';
 }
 
-/* canonicalize with the specific JS regexp rules */
-static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16) {
-  uint32_t res[LRE_CC_RES_LEN_MAX];
-  int len;
-  if (is_utf16) {
-    if (likely(c < 128)) {
-      if (c >= 'A' && c <= 'Z')
-        c = c - 'A' + 'a';
-    } else {
-      lre_case_conv(res, c, 2);
-      c = res[0];
-    }
-  } else {
-    if (likely(c < 128)) {
-      if (c >= 'a' && c <= 'z')
-        c = c - 'a' + 'A';
-    } else {
-      /* legacy regexp: to upper case if single char >= 128 */
-      len = lre_case_conv(res, c, FALSE);
-      if (len == 1 && res[0] >= 128)
-        c = res[0];
-    }
-  }
-  return c;
+/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
+static int dbuf_insert(DynBuf *s, int pos, int len)
+{
+    if (dbuf_realloc(s, s->size + len))
+        return -1;
+    memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
+    s->size += len;
+    return 0;
 }
 
 static const uint16_t char_range_d[] = {
     1,
-    0x0030,
-    0x0039 + 1,
+    0x0030, 0x0039 + 1,
 };
 
 /* code point ranges for Zs,Zl or Zp property */
 static const uint16_t char_range_s[] = {
     10,
-    0x0009,
-    0x000D + 1,
-    0x0020,
-    0x0020 + 1,
-    0x00A0,
-    0x00A0 + 1,
-    0x1680,
-    0x1680 + 1,
-    0x2000,
-    0x200A + 1,
+    0x0009, 0x000D + 1,
+    0x0020, 0x0020 + 1,
+    0x00A0, 0x00A0 + 1,
+    0x1680, 0x1680 + 1,
+    0x2000, 0x200A + 1,
     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
-    0x2028,
-    0x2029 + 1,
-    0x202F,
-    0x202F + 1,
-    0x205F,
-    0x205F + 1,
-    0x3000,
-    0x3000 + 1,
+    0x2028, 0x2029 + 1,
+    0x202F, 0x202F + 1,
+    0x205F, 0x205F + 1,
+    0x3000, 0x3000 + 1,
     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
-    0xFEFF,
-    0xFEFF + 1,
+    0xFEFF, 0xFEFF + 1,
 };
 
-BOOL lre_is_space(int c) {
-  int i, n, low, high;
-  n = (countof(char_range_s) - 1) / 2;
-  for (i = 0; i < n; i++) {
-    low = char_range_s[2 * i + 1];
-    if (c < low)
-      return FALSE;
-    high = char_range_s[2 * i + 2];
-    if (c < high)
-      return TRUE;
-  }
-  return FALSE;
-}
-
-uint32_t const lre_id_start_table_ascii[4] = {
-    /* $ A-Z _ a-z */
-    0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
-
-uint32_t const lre_id_continue_table_ascii[4] = {
-    /* $ 0-9 A-Z _ a-z */
-    0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE};
-
 static const uint16_t char_range_w[] = {
-    4,      0x0030,     0x0039 + 1, 0x0041,     0x005A + 1,
-    0x005F, 0x005F + 1, 0x0061,     0x007A + 1,
+    4,
+    0x0030, 0x0039 + 1,
+    0x0041, 0x005A + 1,
+    0x005F, 0x005F + 1,
+    0x0061, 0x007A + 1,
 };
 
 #define CLASS_RANGE_BASE 0x40000000
 
 typedef enum {
-  CHAR_RANGE_d,
-  CHAR_RANGE_D,
-  CHAR_RANGE_s,
-  CHAR_RANGE_S,
-  CHAR_RANGE_w,
-  CHAR_RANGE_W,
+    CHAR_RANGE_d,
+    CHAR_RANGE_D,
+    CHAR_RANGE_s,
+    CHAR_RANGE_S,
+    CHAR_RANGE_w,
+    CHAR_RANGE_W,
 } CharRangeEnum;
 
-static const uint16_t *char_range_table[] = {
+static const uint16_t * const char_range_table[] = {
     char_range_d,
     char_range_s,
     char_range_w,
 };
 
-static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) {
-  BOOL invert;
-  const uint16_t *c_pt;
-  int len, i;
-
-  invert = c & 1;
-  c_pt = char_range_table[c >> 1];
-  len = *c_pt++;
-  cr_init(cr, s->opaque, lre_realloc);
-  for (i = 0; i < len * 2; i++) {
-    if (cr_add_point(cr, c_pt[i]))
-      goto fail;
-  }
-  if (invert) {
-    if (cr_invert(cr))
-      goto fail;
-  }
-  return 0;
-fail:
-  cr_free(cr);
-  return -1;
-}
+static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
+{
+    BOOL invert;
+    const uint16_t *c_pt;
+    int len, i;
 
-static int cr_canonicalize(CharRange *cr) {
-  CharRange a;
-  uint32_t pt[2];
-  int i, ret;
-
-  cr_init(&a, cr->mem_opaque, lre_realloc);
-  pt[0] = 'a';
-  pt[1] = 'z' + 1;
-  ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
-  if (ret)
-    goto fail;
-  /* convert to upper case */
-  /* XXX: the generic unicode case would be much more complicated
-     and not really useful */
-  for (i = 0; i < a.len; i++) {
-    a.points[i] += 'A' - 'a';
-  }
-  /* Note: for simplicity we keep the lower case ranges */
-  ret = cr_union1(cr, a.points, a.len);
-fail:
-  cr_free(&a);
-  return ret;
+    invert = c & 1;
+    c_pt = char_range_table[c >> 1];
+    len = *c_pt++;
+    cr_init(cr, s->opaque, lre_realloc);
+    for(i = 0; i < len * 2; i++) {
+        if (cr_add_point(cr, c_pt[i]))
+            goto fail;
+    }
+    if (invert) {
+        if (cr_invert(cr))
+            goto fail;
+    }
+    return 0;
+ fail:
+    cr_free(cr);
+    return -1;
 }
 
 #ifdef DUMP_REOP
-static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, int buf_len) {
-  int pos, len, opcode, bc_len, re_flags, i;
-  uint32_t val;
-
-  assert(buf_len >= RE_HEADER_LEN);
-
-  re_flags = buf[0];
-  bc_len = get_u32(buf + 3);
-  assert(bc_len + RE_HEADER_LEN <= buf_len);
-  printf("flags: 0x%x capture_count=%d stack_size=%d\n", re_flags, buf[1],
-         buf[2]);
-  if (re_flags & LRE_FLAG_NAMED_GROUPS) {
-    const char *p;
-    p = (char *)buf + RE_HEADER_LEN + bc_len;
-    printf("named groups: ");
-    for (i = 1; i < buf[1]; i++) {
-      if (i != 1)
-        printf(",");
-      printf("<%s>", p);
-      p += strlen(p) + 1;
-    }
-    printf("\n");
-    assert(p == (char *)(buf + buf_len));
-  }
-  printf("bytecode_len=%d\n", bc_len);
-
-  buf += RE_HEADER_LEN;
-  pos = 0;
-  while (pos < bc_len) {
-    printf("%5u: ", pos);
-    opcode = buf[pos];
-    len = reopcode_info[opcode].size;
-    if (opcode >= REOP_COUNT) {
-      printf(" invalid opcode=0x%02x\n", opcode);
-      break;
-    }
-    if ((pos + len) > bc_len) {
-      printf(" buffer overflow (opcode=0x%02x)\n", opcode);
-      break;
+static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
+                                                     int buf_len)
+{
+    int pos, len, opcode, bc_len, re_flags, i;
+    uint32_t val;
+
+    assert(buf_len >= RE_HEADER_LEN);
+
+    re_flags = lre_get_flags(buf);
+    bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
+    assert(bc_len + RE_HEADER_LEN <= buf_len);
+    printf("flags: 0x%x capture_count=%d stack_size=%d\n",
+           re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
+    if (re_flags & LRE_FLAG_NAMED_GROUPS) {
+        const char *p;
+        p = (char *)buf + RE_HEADER_LEN + bc_len;
+        printf("named groups: ");
+        for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
+            if (i != 1)
+                printf(",");
+            printf("<%s>", p);
+            p += strlen(p) + 1;
+        }
+        printf("\n");
+        assert(p == (char *)(buf + buf_len));
     }
-    printf("%s", reopcode_info[opcode].name);
-    switch (opcode) {
-    case REOP_char:
-      val = get_u16(buf + pos + 1);
-      if (val >= ' ' && val <= 126)
-        printf(" '%c'", val);
-      else
-        printf(" 0x%04x", val);
-      break;
-    case REOP_char32:
-      val = get_u32(buf + pos + 1);
-      if (val >= ' ' && val <= 126)
-        printf(" '%c'", val);
-      else
-        printf(" 0x%08x", val);
-      break;
-    case REOP_goto:
-    case REOP_split_goto_first:
-    case REOP_split_next_first:
-    case REOP_loop:
-    case REOP_lookahead:
-    case REOP_negative_lookahead:
-    case REOP_bne_char_pos:
-      val = get_u32(buf + pos + 1);
-      val += (pos + 5);
-      printf(" %u", val);
-      break;
-    case REOP_simple_greedy_quant:
-      printf(" %u %u %u %u", get_u32(buf + pos + 1) + (pos + 17),
-             get_u32(buf + pos + 1 + 4), get_u32(buf + pos + 1 + 8),
-             get_u32(buf + pos + 1 + 12));
-      break;
-    case REOP_save_start:
-    case REOP_save_end:
-    case REOP_back_reference:
-    case REOP_backward_back_reference:
-      printf(" %u", buf[pos + 1]);
-      break;
-    case REOP_save_reset:
-      printf(" %u %u", buf[pos + 1], buf[pos + 2]);
-      break;
-    case REOP_push_i32:
-      val = get_u32(buf + pos + 1);
-      printf(" %d", val);
-      break;
-    case REOP_range: {
-      int n, i;
-      n = get_u16(buf + pos + 1);
-      len += n * 4;
-      for (i = 0; i < n * 2; i++) {
-        val = get_u16(buf + pos + 3 + i * 2);
-        printf(" 0x%04x", val);
-      }
-    } break;
-    case REOP_range32: {
-      int n, i;
-      n = get_u16(buf + pos + 1);
-      len += n * 8;
-      for (i = 0; i < n * 2; i++) {
-        val = get_u32(buf + pos + 3 + i * 4);
-        printf(" 0x%08x", val);
-      }
-    } break;
-    default:
-      break;
+    printf("bytecode_len=%d\n", bc_len);
+
+    buf += RE_HEADER_LEN;
+    pos = 0;
+    while (pos < bc_len) {
+        printf("%5u: ", pos);
+        opcode = buf[pos];
+        len = reopcode_info[opcode].size;
+        if (opcode >= REOP_COUNT) {
+            printf(" invalid opcode=0x%02x\n", opcode);
+            break;
+        }
+        if ((pos + len) > bc_len) {
+            printf(" buffer overflow (opcode=0x%02x)\n", opcode);
+            break;
+        }
+        printf("%s", reopcode_info[opcode].name);
+        switch(opcode) {
+        case REOP_char:
+            val = get_u16(buf + pos + 1);
+            if (val >= ' ' && val <= 126)
+                printf(" '%c'", val);
+            else
+                printf(" 0x%04x", val);
+            break;
+        case REOP_char32:
+            val = get_u32(buf + pos + 1);
+            if (val >= ' ' && val <= 126)
+                printf(" '%c'", val);
+            else
+                printf(" 0x%08x", val);
+            break;
+        case REOP_goto:
+        case REOP_split_goto_first:
+        case REOP_split_next_first:
+        case REOP_loop:
+        case REOP_lookahead:
+        case REOP_negative_lookahead:
+            val = get_u32(buf + pos + 1);
+            val += (pos + 5);
+            printf(" %u", val);
+            break;
+        case REOP_simple_greedy_quant:
+            printf(" %u %u %u %u",
+                   get_u32(buf + pos + 1) + (pos + 17),
+                   get_u32(buf + pos + 1 + 4),
+                   get_u32(buf + pos + 1 + 8),
+                   get_u32(buf + pos + 1 + 12));
+            break;
+        case REOP_save_start:
+        case REOP_save_end:
+        case REOP_back_reference:
+        case REOP_backward_back_reference:
+            printf(" %u", buf[pos + 1]);
+            break;
+        case REOP_save_reset:
+            printf(" %u %u", buf[pos + 1], buf[pos + 2]);
+            break;
+        case REOP_push_i32:
+            val = get_u32(buf + pos + 1);
+            printf(" %d", val);
+            break;
+        case REOP_range:
+            {
+                int n, i;
+                n = get_u16(buf + pos + 1);
+                len += n * 4;
+                for(i = 0; i < n * 2; i++) {
+                    val = get_u16(buf + pos + 3 + i * 2);
+                    printf(" 0x%04x", val);
+                }
+            }
+            break;
+        case REOP_range32:
+            {
+                int n, i;
+                n = get_u16(buf + pos + 1);
+                len += n * 8;
+                for(i = 0; i < n * 2; i++) {
+                    val = get_u32(buf + pos + 3 + i * 4);
+                    printf(" 0x%08x", val);
+                }
+            }
+            break;
+        default:
+            break;
+        }
+        printf("\n");
+        pos += len;
     }
-    printf("\n");
-    pos += len;
-  }
 }
 #endif
 
-static void re_emit_op(REParseState *s, int op) {
-  dbuf_putc(&s->byte_code, op);
+static void re_emit_op(REParseState *s, int op)
+{
+    dbuf_putc(&s->byte_code, op);
 }
 
 /* return the offset of the u32 value */
-static int re_emit_op_u32(REParseState *s, int op, uint32_t val) {
-  int pos;
-  dbuf_putc(&s->byte_code, op);
-  pos = s->byte_code.size;
-  dbuf_put_u32(&s->byte_code, val);
-  return pos;
+static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
+{
+    int pos;
+    dbuf_putc(&s->byte_code, op);
+    pos = s->byte_code.size;
+    dbuf_put_u32(&s->byte_code, val);
+    return pos;
 }
 
-static int re_emit_goto(REParseState *s, int op, uint32_t val) {
-  int pos;
-  dbuf_putc(&s->byte_code, op);
-  pos = s->byte_code.size;
-  dbuf_put_u32(&s->byte_code, val - (pos + 4));
-  return pos;
+static int re_emit_goto(REParseState *s, int op, uint32_t val)
+{
+    int pos;
+    dbuf_putc(&s->byte_code, op);
+    pos = s->byte_code.size;
+    dbuf_put_u32(&s->byte_code, val - (pos + 4));
+    return pos;
 }
 
-static void re_emit_op_u8(REParseState *s, int op, uint32_t val) {
-  dbuf_putc(&s->byte_code, op);
-  dbuf_putc(&s->byte_code, val);
+static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
+{
+    dbuf_putc(&s->byte_code, op);
+    dbuf_putc(&s->byte_code, val);
 }
 
-static void re_emit_op_u16(REParseState *s, int op, uint32_t val) {
-  dbuf_putc(&s->byte_code, op);
-  dbuf_put_u16(&s->byte_code, val);
+static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
+{
+    dbuf_putc(&s->byte_code, op);
+    dbuf_put_u16(&s->byte_code, val);
 }
 
-static int __attribute__((format(printf, 2, 3)))
-re_parse_error(REParseState *s, const char *fmt, ...) {
-  va_list ap;
-  va_start(ap, fmt);
-  vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
-  va_end(ap);
-  return -1;
+static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
+    va_end(ap);
+    return -1;
 }
 
-static int re_parse_out_of_memory(REParseState *s) {
-  return re_parse_error(s, "out of memory");
+static int re_parse_out_of_memory(REParseState *s)
+{
+    return re_parse_error(s, "out of memory");
 }
 
 /* If allow_overflow is false, return -1 in case of
    overflow. Otherwise return INT32_MAX. */
-static int parse_digits(const uint8_t **pp, BOOL allow_overflow) {
-  const uint8_t *p;
-  uint64_t v;
-  int c;
-
-  p = *pp;
-  v = 0;
-  for (;;) {
-    c = *p;
-    if (c < '0' || c > '9')
-      break;
-    v = v * 10 + c - '0';
-    if (v >= INT32_MAX) {
-      if (allow_overflow)
-        v = INT32_MAX;
-      else
-        return -1;
+static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
+{
+    const uint8_t *p;
+    uint64_t v;
+    int c;
+
+    p = *pp;
+    v = 0;
+    for(;;) {
+        c = *p;
+        if (c < '0' || c > '9')
+            break;
+        v = v * 10 + c - '0';
+        if (v >= INT32_MAX) {
+            if (allow_overflow)
+                v = INT32_MAX;
+            else
+                return -1;
+        }
+        p++;
     }
-    p++;
-  }
-  *pp = p;
-  return v;
+    *pp = p;
+    return v;
 }
 
-static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) {
-  const uint8_t *p;
-  p = *pp;
-  if (*p != c)
-    return re_parse_error(s, "expecting '%c'", c);
-  p++;
-  *pp = p;
-  return 0;
+static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
+{
+    const uint8_t *p;
+    p = *pp;
+    if (*p != c)
+        return re_parse_error(s, "expecting '%c'", c);
+    p++;
+    *pp = p;
+    return 0;
 }
 
 /* Parse an escape sequence, *pp points after the '\':
@@ -471,1312 +409,1307 @@ static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) {
    Return the unicode char and update *pp if recognized,
    return -1 if malformed escape,
    return -2 otherwise. */
-int lre_parse_escape(const uint8_t **pp, int allow_utf16) {
-  const uint8_t *p;
-  uint32_t c;
-
-  p = *pp;
-  c = *p++;
-  switch (c) {
-  case 'b':
-    c = '\b';
-    break;
-  case 'f':
-    c = '\f';
-    break;
-  case 'n':
-    c = '\n';
-    break;
-  case 'r':
-    c = '\r';
-    break;
-  case 't':
-    c = '\t';
-    break;
-  case 'v':
-    c = '\v';
-    break;
-  case 'x':
-  case 'u': {
-    int h, n, i;
-    uint32_t c1;
-
-    if (*p == '{' && allow_utf16) {
-      p++;
-      c = 0;
-      for (;;) {
-        h = from_hex(*p++);
-        if (h < 0)
-          return -1;
-        c = (c << 4) | h;
-        if (c > 0x10FFFF)
-          return -1;
-        if (*p == '}')
-          break;
-      }
-      p++;
-    } else {
-      if (c == 'x') {
-        n = 2;
-      } else {
-        n = 4;
-      }
-
-      c = 0;
-      for (i = 0; i < n; i++) {
-        h = from_hex(*p++);
-        if (h < 0) {
-          return -1;
-        }
-        c = (c << 4) | h;
-      }
-      if (c >= 0xd800 && c < 0xdc00 && allow_utf16 == 2 && p[0] == '\\' &&
-          p[1] == 'u') {
-        /* convert an escaped surrogate pair into a
-           unicode char */
-        c1 = 0;
-        for (i = 0; i < 4; i++) {
-          h = from_hex(p[2 + i]);
-          if (h < 0)
-            break;
-          c1 = (c1 << 4) | h;
-        }
-        if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
-          p += 6;
-          c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
-        }
-      }
-    }
-  } break;
-  case '0':
-  case '1':
-  case '2':
-  case '3':
-  case '4':
-  case '5':
-  case '6':
-  case '7':
-    c -= '0';
-    if (allow_utf16 == 2) {
-      /* only accept \0 not followed by digit */
-      if (c != 0 || is_digit(*p))
-        return -1;
-    } else {
-      /* parse a legacy octal sequence */
-      uint32_t v;
-      v = *p - '0';
-      if (v > 7)
+int lre_parse_escape(const uint8_t **pp, int allow_utf16)
+{
+    const uint8_t *p;
+    uint32_t c;
+
+    p = *pp;
+    c = *p++;
+    switch(c) {
+    case 'b':
+        c = '\b';
+        break;
+    case 'f':
+        c = '\f';
+        break;
+    case 'n':
+        c = '\n';
+        break;
+    case 'r':
+        c = '\r';
         break;
-      c = (c << 3) | v;
-      p++;
-      if (c >= 32)
+    case 't':
+        c = '\t';
+        break;
+    case 'v':
+        c = '\v';
+        break;
+    case 'x':
+    case 'u':
+        {
+            int h, n, i;
+            uint32_t c1;
+
+            if (*p == '{' && allow_utf16) {
+                p++;
+                c = 0;
+                for(;;) {
+                    h = from_hex(*p++);
+                    if (h < 0)
+                        return -1;
+                    c = (c << 4) | h;
+                    if (c > 0x10FFFF)
+                        return -1;
+                    if (*p == '}')
+                        break;
+                }
+                p++;
+            } else {
+                if (c == 'x') {
+                    n = 2;
+                } else {
+                    n = 4;
+                }
+
+                c = 0;
+                for(i = 0; i < n; i++) {
+                    h = from_hex(*p++);
+                    if (h < 0) {
+                        return -1;
+                    }
+                    c = (c << 4) | h;
+                }
+                if (is_hi_surrogate(c) &&
+                    allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
+                    /* convert an escaped surrogate pair into a
+                       unicode char */
+                    c1 = 0;
+                    for(i = 0; i < 4; i++) {
+                        h = from_hex(p[2 + i]);
+                        if (h < 0)
+                            break;
+                        c1 = (c1 << 4) | h;
+                    }
+                    if (i == 4 && is_lo_surrogate(c1)) {
+                        p += 6;
+                        c = from_surrogate(c, c1);
+                    }
+                }
+            }
+        }
         break;
-      v = *p - '0';
-      if (v > 7)
+    case '0': case '1': case '2': case '3':
+    case '4': case '5': case '6': case '7':
+        c -= '0';
+        if (allow_utf16 == 2) {
+            /* only accept \0 not followed by digit */
+            if (c != 0 || is_digit(*p))
+                return -1;
+        } else {
+            /* parse a legacy octal sequence */
+            uint32_t v;
+            v = *p - '0';
+            if (v > 7)
+                break;
+            c = (c << 3) | v;
+            p++;
+            if (c >= 32)
+                break;
+            v = *p - '0';
+            if (v > 7)
+                break;
+            c = (c << 3) | v;
+            p++;
+        }
         break;
-      c = (c << 3) | v;
-      p++;
+    default:
+        return -2;
     }
-    break;
-  default:
-    return -2;
-  }
-  *pp = p;
-  return c;
+    *pp = p;
+    return c;
 }
 
 #ifdef CONFIG_ALL_UNICODE
 /* XXX: we use the same chars for name and value */
-static BOOL is_unicode_char(int c) {
-  return ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') ||
-          (c >= 'a' && c <= 'z') || (c == '_'));
+static BOOL is_unicode_char(int c)
+{
+    return ((c >= '0' && c <= '9') ||
+            (c >= 'A' && c <= 'Z') ||
+            (c >= 'a' && c <= 'z') ||
+            (c == '_'));
 }
 
 static int parse_unicode_property(REParseState *s, CharRange *cr,
-                                  const uint8_t **pp, BOOL is_inv) {
-  const uint8_t *p;
-  char name[64], value[64];
-  char *q;
-  BOOL script_ext;
-  int ret;
-
-  p = *pp;
-  if (*p != '{')
-    return re_parse_error(s, "expecting '{' after \\p");
-  p++;
-  q = name;
-  while (is_unicode_char(*p)) {
-    if ((q - name) >= sizeof(name) - 1)
-      goto unknown_property_name;
-    *q++ = *p++;
-  }
-  *q = '\0';
-  q = value;
-  if (*p == '=') {
+                                  const uint8_t **pp, BOOL is_inv)
+{
+    const uint8_t *p;
+    char name[64], value[64];
+    char *q;
+    BOOL script_ext;
+    int ret;
+
+    p = *pp;
+    if (*p != '{')
+        return re_parse_error(s, "expecting '{' after \\p");
     p++;
+    q = name;
     while (is_unicode_char(*p)) {
-      if ((q - value) >= sizeof(value) - 1)
-        return re_parse_error(s, "unknown unicode property value");
-      *q++ = *p++;
+        if ((q - name) >= sizeof(name) - 1)
+            goto unknown_property_name;
+        *q++ = *p++;
     }
-  }
-  *q = '\0';
-  if (*p != '}')
-    return re_parse_error(s, "expecting '}'");
-  p++;
-  //    printf("name=%s value=%s\n", name, value);
-
-  if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
-    script_ext = FALSE;
-    goto do_script;
-  } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
-    script_ext = TRUE;
-  do_script:
-    cr_init(cr, s->opaque, lre_realloc);
-    ret = unicode_script(cr, value, script_ext);
-    if (ret) {
-      cr_free(cr);
-      if (ret == -2)
-        return re_parse_error(s, "unknown unicode script");
-      else
-        goto out_of_memory;
-    }
-  } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
-    cr_init(cr, s->opaque, lre_realloc);
-    ret = unicode_general_category(cr, value);
-    if (ret) {
-      cr_free(cr);
-      if (ret == -2)
-        return re_parse_error(s, "unknown unicode general category");
-      else
-        goto out_of_memory;
-    }
-  } else if (value[0] == '\0') {
-    cr_init(cr, s->opaque, lre_realloc);
-    ret = unicode_general_category(cr, name);
-    if (ret == -1) {
-      cr_free(cr);
-      goto out_of_memory;
+    *q = '\0';
+    q = value;
+    if (*p == '=') {
+        p++;
+        while (is_unicode_char(*p)) {
+            if ((q - value) >= sizeof(value) - 1)
+                return re_parse_error(s, "unknown unicode property value");
+            *q++ = *p++;
+        }
     }
-    if (ret < 0) {
-      ret = unicode_prop(cr, name);
-      if (ret) {
-        cr_free(cr);
-        if (ret == -2)
-          goto unknown_property_name;
-        else
-          goto out_of_memory;
-      }
+    *q = '\0';
+    if (*p != '}')
+        return re_parse_error(s, "expecting '}'");
+    p++;
+    //    printf("name=%s value=%s\n", name, value);
+
+    if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
+        script_ext = FALSE;
+        goto do_script;
+    } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
+        script_ext = TRUE;
+    do_script:
+        cr_init(cr, s->opaque, lre_realloc);
+        ret = unicode_script(cr, value, script_ext);
+        if (ret) {
+            cr_free(cr);
+            if (ret == -2)
+                return re_parse_error(s, "unknown unicode script");
+            else
+                goto out_of_memory;
+        }
+    } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
+        cr_init(cr, s->opaque, lre_realloc);
+        ret = unicode_general_category(cr, value);
+        if (ret) {
+            cr_free(cr);
+            if (ret == -2)
+                return re_parse_error(s, "unknown unicode general category");
+            else
+                goto out_of_memory;
+        }
+    } else if (value[0] == '\0') {
+        cr_init(cr, s->opaque, lre_realloc);
+        ret = unicode_general_category(cr, name);
+        if (ret == -1) {
+            cr_free(cr);
+            goto out_of_memory;
+        }
+        if (ret < 0) {
+            ret = unicode_prop(cr, name);
+            if (ret) {
+                cr_free(cr);
+                if (ret == -2)
+                    goto unknown_property_name;
+                else
+                    goto out_of_memory;
+            }
+        }
+    } else {
+    unknown_property_name:
+        return re_parse_error(s, "unknown unicode property name");
     }
-  } else {
-  unknown_property_name:
-    return re_parse_error(s, "unknown unicode property name");
-  }
-
-  if (is_inv) {
-    if (cr_invert(cr)) {
-      cr_free(cr);
-      return -1;
+
+    if (is_inv) {
+        if (cr_invert(cr)) {
+            cr_free(cr);
+            return -1;
+        }
     }
-  }
-  *pp = p;
-  return 0;
-out_of_memory:
-  return re_parse_out_of_memory(s);
+    *pp = p;
+    return 0;
+ out_of_memory:
+    return re_parse_out_of_memory(s);
 }
 #endif /* CONFIG_ALL_UNICODE */
 
 /* return -1 if error otherwise the character or a class range
    (CLASS_RANGE_BASE). In case of class range, 'cr' is
    initialized. Otherwise, it is ignored. */
-static int get_class_atom(REParseState *s, CharRange *cr, const uint8_t **pp,
-                          BOOL inclass) {
-  const uint8_t *p;
-  uint32_t c;
-  int ret;
+static int get_class_atom(REParseState *s, CharRange *cr,
+                          const uint8_t **pp, BOOL inclass)
+{
+    const uint8_t *p;
+    uint32_t c;
+    int ret;
 
-  p = *pp;
+    p = *pp;
 
-  c = *p;
-  switch (c) {
-  case '\\':
-    p++;
-    if (p >= s->buf_end)
-      goto unexpected_end;
-    c = *p++;
-    switch (c) {
-    case 'd':
-      c = CHAR_RANGE_d;
-      goto class_range;
-    case 'D':
-      c = CHAR_RANGE_D;
-      goto class_range;
-    case 's':
-      c = CHAR_RANGE_s;
-      goto class_range;
-    case 'S':
-      c = CHAR_RANGE_S;
-      goto class_range;
-    case 'w':
-      c = CHAR_RANGE_w;
-      goto class_range;
-    case 'W':
-      c = CHAR_RANGE_W;
-    class_range:
-      if (cr_init_char_range(s, cr, c))
-        return -1;
-      c = CLASS_RANGE_BASE;
-      break;
-    case 'c':
-      c = *p;
-      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
-          (((c >= '0' && c <= '9') || c == '_') && inclass &&
-           !s->is_utf16)) { /* Annex B.1.4 */
-        c &= 0x1f;
+    c = *p;
+    switch(c) {
+    case '\\':
         p++;
-      } else if (s->is_utf16) {
-        goto invalid_escape;
-      } else {
-        /* otherwise return '\' and 'c' */
-        p--;
-        c = '\\';
-      }
-      break;
+        if (p >= s->buf_end)
+            goto unexpected_end;
+        c = *p++;
+        switch(c) {
+        case 'd':
+            c = CHAR_RANGE_d;
+            goto class_range;
+        case 'D':
+            c = CHAR_RANGE_D;
+            goto class_range;
+        case 's':
+            c = CHAR_RANGE_s;
+            goto class_range;
+        case 'S':
+            c = CHAR_RANGE_S;
+            goto class_range;
+        case 'w':
+            c = CHAR_RANGE_w;
+            goto class_range;
+        case 'W':
+            c = CHAR_RANGE_W;
+        class_range:
+            if (cr_init_char_range(s, cr, c))
+                return -1;
+            c = CLASS_RANGE_BASE;
+            break;
+        case 'c':
+            c = *p;
+            if ((c >= 'a' && c <= 'z') ||
+                (c >= 'A' && c <= 'Z') ||
+                (((c >= '0' && c <= '9') || c == '_') &&
+                 inclass && !s->is_unicode)) {   /* Annex B.1.4 */
+                c &= 0x1f;
+                p++;
+            } else if (s->is_unicode) {
+                goto invalid_escape;
+            } else {
+                /* otherwise return '\' and 'c' */
+                p--;
+                c = '\\';
+            }
+            break;
 #ifdef CONFIG_ALL_UNICODE
-    case 'p':
-    case 'P':
-      if (s->is_utf16) {
-        if (parse_unicode_property(s, cr, &p, (c == 'P')))
-          return -1;
-        c = CLASS_RANGE_BASE;
-        break;
-      }
-      /* fall thru */
+        case 'p':
+        case 'P':
+            if (s->is_unicode) {
+                if (parse_unicode_property(s, cr, &p, (c == 'P')))
+                    return -1;
+                c = CLASS_RANGE_BASE;
+                break;
+            }
+            /* fall thru */
 #endif
+        default:
+            p--;
+            ret = lre_parse_escape(&p, s->is_unicode * 2);
+            if (ret >= 0) {
+                c = ret;
+            } else {
+                if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
+                    /* always valid to escape these characters */
+                    goto normal_char;
+                } else if (s->is_unicode) {
+                invalid_escape:
+                    return re_parse_error(s, "invalid escape sequence in regular expression");
+                } else {
+                    /* just ignore the '\' */
+                    goto normal_char;
+                }
+            }
+            break;
+        }
+        break;
+    case '\0':
+        if (p >= s->buf_end) {
+        unexpected_end:
+            return re_parse_error(s, "unexpected end");
+        }
+        /* fall thru */
     default:
-      p--;
-      ret = lre_parse_escape(&p, s->is_utf16 * 2);
-      if (ret >= 0) {
-        c = ret;
-      } else {
-        if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
-          /* always valid to escape these characters */
-          goto normal_char;
-        } else if (s->is_utf16) {
-        invalid_escape:
-          return re_parse_error(
-              s, "invalid escape sequence in regular expression");
+    normal_char:
+        /* normal char */
+        if (c >= 128) {
+            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+            if ((unsigned)c > 0xffff && !s->is_unicode) {
+                /* XXX: should handle non BMP-1 code points */
+                return re_parse_error(s, "malformed unicode char");
+            }
         } else {
-          /* just ignore the '\' */
-          goto normal_char;
+            p++;
         }
-      }
-      break;
-    }
-    break;
-  case '\0':
-    if (p >= s->buf_end) {
-    unexpected_end:
-      return re_parse_error(s, "unexpected end");
-    }
-    /* fall thru */
-  default:
-  normal_char:
-    /* normal char */
-    if (c >= 128) {
-      c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-      if ((unsigned)c > 0xffff && !s->is_utf16) {
-        /* XXX: should handle non BMP-1 code points */
-        return re_parse_error(s, "malformed unicode char");
-      }
-    } else {
-      p++;
+        break;
     }
-    break;
-  }
-  *pp = p;
-  return c;
+    *pp = p;
+    return c;
 }
 
-static int re_emit_range(REParseState *s, const CharRange *cr) {
-  int len, i;
-  uint32_t high;
-
-  len = (unsigned)cr->len / 2;
-  if (len >= 65535)
-    return re_parse_error(s, "too many ranges");
-  if (len == 0) {
-    /* not sure it can really happen. Emit a match that is always
-       false */
-    re_emit_op_u32(s, REOP_char32, -1);
-  } else {
-    high = cr->points[cr->len - 1];
-    if (high == UINT32_MAX)
-      high = cr->points[cr->len - 2];
-    if (high <= 0xffff) {
-      /* can use 16 bit ranges with the conversion that 0xffff =
-         infinity */
-      re_emit_op_u16(s, REOP_range, len);
-      for (i = 0; i < cr->len; i += 2) {
-        dbuf_put_u16(&s->byte_code, cr->points[i]);
-        high = cr->points[i + 1] - 1;
-        if (high == UINT32_MAX - 1)
-          high = 0xffff;
-        dbuf_put_u16(&s->byte_code, high);
-      }
+static int re_emit_range(REParseState *s, const CharRange *cr)
+{
+    int len, i;
+    uint32_t high;
+
+    len = (unsigned)cr->len / 2;
+    if (len >= 65535)
+        return re_parse_error(s, "too many ranges");
+    if (len == 0) {
+        /* not sure it can really happen. Emit a match that is always
+           false */
+        re_emit_op_u32(s, REOP_char32, -1);
     } else {
-      re_emit_op_u16(s, REOP_range32, len);
-      for (i = 0; i < cr->len; i += 2) {
-        dbuf_put_u32(&s->byte_code, cr->points[i]);
-        dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
-      }
+        high = cr->points[cr->len - 1];
+        if (high == UINT32_MAX)
+            high = cr->points[cr->len - 2];
+        if (high <= 0xffff) {
+            /* can use 16 bit ranges with the conversion that 0xffff =
+               infinity */
+            re_emit_op_u16(s, REOP_range, len);
+            for(i = 0; i < cr->len; i += 2) {
+                dbuf_put_u16(&s->byte_code, cr->points[i]);
+                high = cr->points[i + 1] - 1;
+                if (high == UINT32_MAX - 1)
+                    high = 0xffff;
+                dbuf_put_u16(&s->byte_code, high);
+            }
+        } else {
+            re_emit_op_u16(s, REOP_range32, len);
+            for(i = 0; i < cr->len; i += 2) {
+                dbuf_put_u32(&s->byte_code, cr->points[i]);
+                dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
+            }
+        }
     }
-  }
-  return 0;
+    return 0;
 }
 
-static int re_parse_char_class(REParseState *s, const uint8_t **pp) {
-  const uint8_t *p;
-  uint32_t c1, c2;
-  CharRange cr_s, *cr = &cr_s;
-  CharRange cr1_s, *cr1 = &cr1_s;
-  BOOL invert;
-
-  cr_init(cr, s->opaque, lre_realloc);
-  p = *pp;
-  p++; /* skip '[' */
-  invert = FALSE;
-  if (*p == '^') {
-    p++;
-    invert = TRUE;
-  }
-  for (;;) {
-    if (*p == ']')
-      break;
-    c1 = get_class_atom(s, cr1, &p, TRUE);
-    if ((int)c1 < 0)
-      goto fail;
-    if (*p == '-' && p[1] != ']') {
-      const uint8_t *p0 = p + 1;
-      if (c1 >= CLASS_RANGE_BASE) {
-        if (s->is_utf16) {
-          cr_free(cr1);
-          goto invalid_class_range;
-        }
-        /* Annex B: match '-' character */
-        goto class_atom;
-      }
-      c2 = get_class_atom(s, cr1, &p0, TRUE);
-      if ((int)c2 < 0)
-        goto fail;
-      if (c2 >= CLASS_RANGE_BASE) {
-        cr_free(cr1);
-        if (s->is_utf16) {
-          goto invalid_class_range;
+static int re_parse_char_class(REParseState *s, const uint8_t **pp)
+{
+    const uint8_t *p;
+    uint32_t c1, c2;
+    CharRange cr_s, *cr = &cr_s;
+    CharRange cr1_s, *cr1 = &cr1_s;
+    BOOL invert;
+
+    cr_init(cr, s->opaque, lre_realloc);
+    p = *pp;
+    p++;    /* skip '[' */
+
+    invert = FALSE;
+    if (*p == '^') {
+        p++;
+        invert = TRUE;
+    }
+
+    for(;;) {
+        if (*p == ']')
+            break;
+        c1 = get_class_atom(s, cr1, &p, TRUE);
+        if ((int)c1 < 0)
+            goto fail;
+        if (*p == '-' && p[1] != ']') {
+            const uint8_t *p0 = p + 1;
+            if (c1 >= CLASS_RANGE_BASE) {
+                if (s->is_unicode) {
+                    cr_free(cr1);
+                    goto invalid_class_range;
+                }
+                /* Annex B: match '-' character */
+                goto class_atom;
+            }
+            c2 = get_class_atom(s, cr1, &p0, TRUE);
+            if ((int)c2 < 0)
+                goto fail;
+            if (c2 >= CLASS_RANGE_BASE) {
+                cr_free(cr1);
+                if (s->is_unicode) {
+                    goto invalid_class_range;
+                }
+                /* Annex B: match '-' character */
+                goto class_atom;
+            }
+            p = p0;
+            if (c2 < c1) {
+            invalid_class_range:
+                re_parse_error(s, "invalid class range");
+                goto fail;
+            }
+            if (cr_union_interval(cr, c1, c2))
+                goto memory_error;
+        } else {
+        class_atom:
+            if (c1 >= CLASS_RANGE_BASE) {
+                int ret;
+                ret = cr_union1(cr, cr1->points, cr1->len);
+                cr_free(cr1);
+                if (ret)
+                    goto memory_error;
+            } else {
+                if (cr_union_interval(cr, c1, c1))
+                    goto memory_error;
+            }
         }
-        /* Annex B: match '-' character */
-        goto class_atom;
-      }
-      p = p0;
-      if (c2 < c1) {
-      invalid_class_range:
-        re_parse_error(s, "invalid class range");
-        goto fail;
-      }
-      if (cr_union_interval(cr, c1, c2))
-        goto memory_error;
-    } else {
-    class_atom:
-      if (c1 >= CLASS_RANGE_BASE) {
-        int ret;
-        ret = cr_union1(cr, cr1->points, cr1->len);
-        cr_free(cr1);
-        if (ret)
-          goto memory_error;
-      } else {
-        if (cr_union_interval(cr, c1, c1))
-          goto memory_error;
-      }
     }
-  }
-  if (s->ignore_case) {
-    if (cr_canonicalize(cr))
-      goto memory_error;
-  }
-  if (invert) {
-    if (cr_invert(cr))
-      goto memory_error;
-  }
-  if (re_emit_range(s, cr))
-    goto fail;
-  cr_free(cr);
-  p++; /* skip ']' */
-  *pp = p;
-  return 0;
-memory_error:
-  re_parse_out_of_memory(s);
-fail:
-  cr_free(cr);
-  return -1;
+    if (s->ignore_case) {
+        if (cr_regexp_canonicalize(cr, s->is_unicode))
+            goto memory_error;
+    }
+    if (invert) {
+        if (cr_invert(cr))
+            goto memory_error;
+    }
+    if (re_emit_range(s, cr))
+        goto fail;
+    cr_free(cr);
+    p++;    /* skip ']' */
+    *pp = p;
+    return 0;
+ memory_error:
+    re_parse_out_of_memory(s);
+ fail:
+    cr_free(cr);
+    return -1;
 }
 
 /* Return:
-   1 if the opcodes in bc_buf[] always advance the character pointer.
-   0 if the character pointer may not be advanced.
-   -1 if the code may depend on side effects of its previous execution
-   (backreference)
+   - true if the opcodes may not advance the char pointer
+   - false if the opcodes always advance the char pointer
 */
-static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) {
-  int pos, opcode, ret, len, i;
-  uint32_t val, last;
-  BOOL has_back_reference;
-  uint8_t capture_bitmap[CAPTURE_COUNT_MAX];
-
-  ret = -2; /* not known yet */
-  pos = 0;
-  has_back_reference = FALSE;
-  memset(capture_bitmap, 0, sizeof(capture_bitmap));
-
-  while (pos < bc_buf_len) {
-    opcode = bc_buf[pos];
-    len = reopcode_info[opcode].size;
-    switch (opcode) {
-    case REOP_range:
-      val = get_u16(bc_buf + pos + 1);
-      len += val * 4;
-      goto simple_char;
-    case REOP_range32:
-      val = get_u16(bc_buf + pos + 1);
-      len += val * 8;
-      goto simple_char;
-    case REOP_char:
-    case REOP_char32:
-    case REOP_dot:
-    case REOP_any:
-    simple_char:
-      if (ret == -2)
-        ret = 1;
-      break;
-    case REOP_line_start:
-    case REOP_line_end:
-    case REOP_push_i32:
-    case REOP_push_char_pos:
-    case REOP_drop:
-    case REOP_word_boundary:
-    case REOP_not_word_boundary:
-    case REOP_prev:
-      /* no effect */
-      break;
-    case REOP_save_start:
-    case REOP_save_end:
-      val = bc_buf[pos + 1];
-      capture_bitmap[val] |= 1;
-      break;
-    case REOP_save_reset: {
-      val = bc_buf[pos + 1];
-      last = bc_buf[pos + 2];
-      while (val < last)
-        capture_bitmap[val++] |= 1;
-    } break;
-    case REOP_back_reference:
-    case REOP_backward_back_reference:
-      val = bc_buf[pos + 1];
-      capture_bitmap[val] |= 2;
-      has_back_reference = TRUE;
-      break;
-    default:
-      /* safe behvior: we cannot predict the outcome */
-      if (ret == -2)
-        ret = 0;
-      break;
-    }
-    pos += len;
-  }
-  if (has_back_reference) {
-    /* check if there is back reference which references a capture
-       made in the some code */
-    for (i = 0; i < CAPTURE_COUNT_MAX; i++) {
-      if (capture_bitmap[i] == 3)
-        return -1;
+static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
+{
+    int pos, opcode, len;
+    uint32_t val;
+    BOOL ret;
+
+    ret = TRUE;
+    pos = 0;
+    while (pos < bc_buf_len) {
+        opcode = bc_buf[pos];
+        len = reopcode_info[opcode].size;
+        switch(opcode) {
+        case REOP_range:
+            val = get_u16(bc_buf + pos + 1);
+            len += val * 4;
+            goto simple_char;
+        case REOP_range32:
+            val = get_u16(bc_buf + pos + 1);
+            len += val * 8;
+            goto simple_char;
+        case REOP_char:
+        case REOP_char32:
+        case REOP_dot:
+        case REOP_any:
+        simple_char:
+            ret = FALSE;
+            break;
+        case REOP_line_start:
+        case REOP_line_end:
+        case REOP_push_i32:
+        case REOP_push_char_pos:
+        case REOP_drop:
+        case REOP_word_boundary:
+        case REOP_not_word_boundary:
+        case REOP_prev:
+            /* no effect */
+            break;
+        case REOP_save_start:
+        case REOP_save_end:
+        case REOP_save_reset:
+        case REOP_back_reference:
+        case REOP_backward_back_reference:
+            break;
+        default:
+            /* safe behavior: we cannot predict the outcome */
+            return TRUE;
+        }
+        pos += len;
     }
-  }
-  if (ret == -2)
-    ret = 0;
-  return ret;
+    return ret;
 }
 
 /* return -1 if a simple quantifier cannot be used. Otherwise return
    the number of characters in the atom. */
-static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) {
-  int pos, opcode, len, count;
-  uint32_t val;
-
-  count = 0;
-  pos = 0;
-  while (pos < bc_buf_len) {
-    opcode = bc_buf[pos];
-    len = reopcode_info[opcode].size;
-    switch (opcode) {
-    case REOP_range:
-      val = get_u16(bc_buf + pos + 1);
-      len += val * 4;
-      goto simple_char;
-    case REOP_range32:
-      val = get_u16(bc_buf + pos + 1);
-      len += val * 8;
-      goto simple_char;
-    case REOP_char:
-    case REOP_char32:
-    case REOP_dot:
-    case REOP_any:
-    simple_char:
-      count++;
-      break;
-    case REOP_line_start:
-    case REOP_line_end:
-    case REOP_word_boundary:
-    case REOP_not_word_boundary:
-      break;
-    default:
-      return -1;
+static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
+{
+    int pos, opcode, len, count;
+    uint32_t val;
+
+    count = 0;
+    pos = 0;
+    while (pos < bc_buf_len) {
+        opcode = bc_buf[pos];
+        len = reopcode_info[opcode].size;
+        switch(opcode) {
+        case REOP_range:
+            val = get_u16(bc_buf + pos + 1);
+            len += val * 4;
+            goto simple_char;
+        case REOP_range32:
+            val = get_u16(bc_buf + pos + 1);
+            len += val * 8;
+            goto simple_char;
+        case REOP_char:
+        case REOP_char32:
+        case REOP_dot:
+        case REOP_any:
+        simple_char:
+            count++;
+            break;
+        case REOP_line_start:
+        case REOP_line_end:
+        case REOP_word_boundary:
+        case REOP_not_word_boundary:
+            break;
+        default:
+            return -1;
+        }
+        pos += len;
     }
-    pos += len;
-  }
-  return count;
+    return count;
 }
 
 /* '*pp' is the first char after '<' */
-static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) {
-  const uint8_t *p, *p1;
-  uint32_t c, d;
-  char *q;
-
-  p = *pp;
-  q = buf;
-  for (;;) {
-    c = *p;
-    if (c == '\\') {
-      p++;
-      if (*p != 'u')
-        return -1;
-      c = lre_parse_escape(&p, 2); // accept surrogate pairs
-    } else if (c == '>') {
-      break;
-    } else if (c >= 128) {
-      c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
-      if (c >= 0xD800 && c <= 0xDBFF) {
-        d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
-        if (d >= 0xDC00 && d <= 0xDFFF) {
-          c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00);
-          p = p1;
+static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
+{
+    const uint8_t *p, *p1;
+    uint32_t c, d;
+    char *q;
+
+    p = *pp;
+    q = buf;
+    for(;;) {
+        c = *p;
+        if (c == '\\') {
+            p++;
+            if (*p != 'u')
+                return -1;
+            c = lre_parse_escape(&p, 2); // accept surrogate pairs
+        } else if (c == '>') {
+            break;
+        } else if (c >= 128) {
+            c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+            if (is_hi_surrogate(c)) {
+                d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
+                if (is_lo_surrogate(d)) {
+                    c = from_surrogate(c, d);
+                    p = p1;
+                }
+            }
+        } else {
+            p++;
+        }
+        if (c > 0x10FFFF)
+            return -1;
+        if (q == buf) {
+            if (!lre_js_is_ident_first(c))
+                return -1;
+        } else {
+            if (!lre_js_is_ident_next(c))
+                return -1;
+        }
+        if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
+            return -1;
+        if (c < 128) {
+            *q++ = c;
+        } else {
+            q += unicode_to_utf8((uint8_t*)q, c);
         }
-      }
-    } else {
-      p++;
     }
-    if (c > 0x10FFFF)
-      return -1;
-    if (q == buf) {
-      if (!lre_js_is_ident_first(c))
-        return -1;
-    } else {
-      if (!lre_js_is_ident_next(c))
+    if (q == buf)
         return -1;
-    }
-    if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
-      return -1;
-    if (c < 128) {
-      *q++ = c;
-    } else {
-      q += unicode_to_utf8((uint8_t *)q, c);
-    }
-  }
-  if (q == buf)
-    return -1;
-  *q = '\0';
-  p++;
-  *pp = p;
-  return 0;
+    *q = '\0';
+    p++;
+    *pp = p;
+    return 0;
 }
 
 /* if capture_name = NULL: return the number of captures + 1.
    Otherwise, return the capture index corresponding to capture_name
    or -1 if none */
 static int re_parse_captures(REParseState *s, int *phas_named_captures,
-                             const char *capture_name) {
-  const uint8_t *p;
-  int capture_index;
-  char name[TMP_BUF_SIZE];
-
-  capture_index = 1;
-  *phas_named_captures = 0;
-  for (p = s->buf_start; p < s->buf_end; p++) {
-    switch (*p) {
-    case '(':
-      if (p[1] == '?') {
-        if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
-          *phas_named_captures = 1;
-          /* potential named capture */
-          if (capture_name) {
-            p += 3;
-            if (re_parse_group_name(name, sizeof(name), &p) == 0) {
-              if (!strcmp(name, capture_name))
-                return capture_index;
+                             const char *capture_name)
+{
+    const uint8_t *p;
+    int capture_index;
+    char name[TMP_BUF_SIZE];
+
+    capture_index = 1;
+    *phas_named_captures = 0;
+    for (p = s->buf_start; p < s->buf_end; p++) {
+        switch (*p) {
+        case '(':
+            if (p[1] == '?') {
+                if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
+                    *phas_named_captures = 1;
+                    /* potential named capture */
+                    if (capture_name) {
+                        p += 3;
+                        if (re_parse_group_name(name, sizeof(name), &p) == 0) {
+                            if (!strcmp(name, capture_name))
+                                return capture_index;
+                        }
+                    }
+                    capture_index++;
+                    if (capture_index >= CAPTURE_COUNT_MAX)
+                        goto done;
+                }
+            } else {
+                capture_index++;
+                if (capture_index >= CAPTURE_COUNT_MAX)
+                    goto done;
+            }
+            break;
+        case '\\':
+            p++;
+            break;
+        case '[':
+            for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
+                if (*p == '\\')
+                    p++;
             }
-          }
-          capture_index++;
-          if (capture_index >= CAPTURE_COUNT_MAX)
-            goto done;
+            break;
         }
-      } else {
-        capture_index++;
-        if (capture_index >= CAPTURE_COUNT_MAX)
-          goto done;
-      }
-      break;
-    case '\\':
-      p++;
-      break;
-    case '[':
-      for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
-        if (*p == '\\')
-          p++;
-      }
-      break;
     }
-  }
-done:
-  if (capture_name)
-    return -1;
-  else
-    return capture_index;
+ done:
+    if (capture_name)
+        return -1;
+    else
+        return capture_index;
 }
 
-static int re_count_captures(REParseState *s) {
-  if (s->total_capture_count < 0) {
-    s->total_capture_count = re_parse_captures(s, &s->has_named_captures, NULL);
-  }
-  return s->total_capture_count;
+static int re_count_captures(REParseState *s)
+{
+    if (s->total_capture_count < 0) {
+        s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
+                                                   NULL);
+    }
+    return s->total_capture_count;
 }
 
-static BOOL re_has_named_captures(REParseState *s) {
-  if (s->has_named_captures < 0)
-    re_count_captures(s);
-  return s->has_named_captures;
+static BOOL re_has_named_captures(REParseState *s)
+{
+    if (s->has_named_captures < 0)
+        re_count_captures(s);
+    return s->has_named_captures;
 }
 
-static int find_group_name(REParseState *s, const char *name) {
-  const char *p, *buf_end;
-  size_t len, name_len;
-  int capture_index;
-
-  name_len = strlen(name);
-  p = (char *)s->group_names.buf;
-  buf_end = (char *)s->group_names.buf + s->group_names.size;
-  capture_index = 1;
-  while (p < buf_end) {
-    len = strlen(p);
-    if (len == name_len && memcmp(name, p, name_len) == 0)
-      return capture_index;
-    p += len + 1;
-    capture_index++;
-  }
-  return -1;
+static int find_group_name(REParseState *s, const char *name)
+{
+    const char *p, *buf_end;
+    size_t len, name_len;
+    int capture_index;
+
+    p = (char *)s->group_names.buf;
+    if (!p) return -1;
+    buf_end = (char *)s->group_names.buf + s->group_names.size;
+    name_len = strlen(name);
+    capture_index = 1;
+    while (p < buf_end) {
+        len = strlen(p);
+        if (len == name_len && memcmp(name, p, name_len) == 0)
+            return capture_index;
+        p += len + 1;
+        capture_index++;
+    }
+    return -1;
 }
 
 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
 
-static int re_parse_term(REParseState *s, BOOL is_backward_dir) {
-  const uint8_t *p;
-  int c, last_atom_start, quant_min, quant_max, last_capture_count;
-  BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
-  CharRange cr_s, *cr = &cr_s;
-
-  last_atom_start = -1;
-  last_capture_count = 0;
-  p = s->buf_ptr;
-  c = *p;
-  switch (c) {
-  case '^':
-    p++;
-    re_emit_op(s, REOP_line_start);
-    break;
-  case '$':
-    p++;
-    re_emit_op(s, REOP_line_end);
-    break;
-  case '.':
-    p++;
-    last_atom_start = s->byte_code.size;
-    last_capture_count = s->capture_count;
-    if (is_backward_dir)
-      re_emit_op(s, REOP_prev);
-    re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
-    if (is_backward_dir)
-      re_emit_op(s, REOP_prev);
-    break;
-  case '{':
-    if (s->is_utf16) {
-      return re_parse_error(s, "syntax error");
-    } else if (!is_digit(p[1])) {
-      /* Annex B: we accept '{' not followed by digits as a
-         normal atom */
-      goto parse_class_atom;
-    } else {
-      const uint8_t *p1 = p + 1;
-      /* Annex B: error if it is like a repetition count */
-      parse_digits(&p1, TRUE);
-      if (*p1 == ',') {
-        p1++;
-        if (is_digit(*p1)) {
-          parse_digits(&p1, TRUE);
-        }
-      }
-      if (*p1 != '}') {
-        goto parse_class_atom;
-      }
-    }
-    /* fall thru */
-  case '*':
-  case '+':
-  case '?':
-    return re_parse_error(s, "nothing to repeat");
-  case '(':
-    if (p[1] == '?') {
-      if (p[2] == ':') {
-        p += 3;
+static int re_parse_term(REParseState *s, BOOL is_backward_dir)
+{
+    const uint8_t *p;
+    int c, last_atom_start, quant_min, quant_max, last_capture_count;
+    BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
+    CharRange cr_s, *cr = &cr_s;
+
+    last_atom_start = -1;
+    last_capture_count = 0;
+    p = s->buf_ptr;
+    c = *p;
+    switch(c) {
+    case '^':
+        p++;
+        re_emit_op(s, REOP_line_start);
+        break;
+    case '$':
+        p++;
+        re_emit_op(s, REOP_line_end);
+        break;
+    case '.':
+        p++;
         last_atom_start = s->byte_code.size;
         last_capture_count = s->capture_count;
-        s->buf_ptr = p;
-        if (re_parse_disjunction(s, is_backward_dir))
-          return -1;
-        p = s->buf_ptr;
-        if (re_parse_expect(s, &p, ')'))
-          return -1;
-      } else if ((p[2] == '=' || p[2] == '!')) {
-        is_neg = (p[2] == '!');
-        is_backward_lookahead = FALSE;
-        p += 3;
-        goto lookahead;
-      } else if (p[2] == '<' && (p[3] == '=' || p[3] == '!')) {
-        int pos;
-        is_neg = (p[3] == '!');
-        is_backward_lookahead = TRUE;
-        p += 4;
-        /* lookahead */
-      lookahead:
-        /* Annex B allows lookahead to be used as an atom for
-           the quantifiers */
-        if (!s->is_utf16 && !is_backward_lookahead) {
-          last_atom_start = s->byte_code.size;
-          last_capture_count = s->capture_count;
-        }
-        pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
-        s->buf_ptr = p;
-        if (re_parse_disjunction(s, is_backward_lookahead))
-          return -1;
-        p = s->buf_ptr;
-        if (re_parse_expect(s, &p, ')'))
-          return -1;
-        re_emit_op(s, REOP_match);
-        /* jump after the 'match' after the lookahead is successful */
-        if (dbuf_error(&s->byte_code))
-          return -1;
-        put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
-      } else if (p[2] == '<') {
-        p += 3;
-        if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), &p)) {
-          return re_parse_error(s, "invalid group name");
-        }
-        if (find_group_name(s, s->u.tmp_buf) > 0) {
-          return re_parse_error(s, "duplicate group name");
-        }
-        /* group name with a trailing zero */
-        dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
-                 strlen(s->u.tmp_buf) + 1);
-        s->has_named_captures = 1;
-        goto parse_capture;
-      } else {
-        return re_parse_error(s, "invalid group");
-      }
-    } else {
-      int capture_index;
-      p++;
-      /* capture without group name */
-      dbuf_putc(&s->group_names, 0);
-    parse_capture:
-      if (s->capture_count >= CAPTURE_COUNT_MAX)
-        return re_parse_error(s, "too many captures");
-      last_atom_start = s->byte_code.size;
-      last_capture_count = s->capture_count;
-      capture_index = s->capture_count++;
-      re_emit_op_u8(s, REOP_save_start + is_backward_dir, capture_index);
-
-      s->buf_ptr = p;
-      if (re_parse_disjunction(s, is_backward_dir))
-        return -1;
-      p = s->buf_ptr;
-
-      re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir, capture_index);
-
-      if (re_parse_expect(s, &p, ')'))
-        return -1;
-    }
-    break;
-  case '\\':
-    switch (p[1]) {
-    case 'b':
-    case 'B':
-      re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
-      p += 2;
-      break;
-    case 'k': {
-      const uint8_t *p1;
-      int dummy_res;
-
-      p1 = p;
-      if (p1[2] != '<') {
-        /* annex B: we tolerate invalid group names in non
-           unicode mode if there is no named capture
-           definition */
-        if (s->is_utf16 || re_has_named_captures(s))
-          return re_parse_error(s, "expecting group name");
-        else
-          goto parse_class_atom;
-      }
-      p1 += 3;
-      if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), &p1)) {
-        if (s->is_utf16 || re_has_named_captures(s))
-          return re_parse_error(s, "invalid group name");
-        else
-          goto parse_class_atom;
-      }
-      c = find_group_name(s, s->u.tmp_buf);
-      if (c < 0) {
-        /* no capture name parsed before, try to look
-           after (inefficient, but hopefully not common */
-        c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
-        if (c < 0) {
-          if (s->is_utf16 || re_has_named_captures(s))
-            return re_parse_error(s, "group name not defined");
-          else
+        if (is_backward_dir)
+            re_emit_op(s, REOP_prev);
+        re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
+        if (is_backward_dir)
+            re_emit_op(s, REOP_prev);
+        break;
+    case '{':
+        if (s->is_unicode) {
+            return re_parse_error(s, "syntax error");
+        } else if (!is_digit(p[1])) {
+            /* Annex B: we accept '{' not followed by digits as a
+               normal atom */
             goto parse_class_atom;
-        }
-      }
-      p = p1;
-    }
-      goto emit_back_reference;
-    case '0':
-      p += 2;
-      c = 0;
-      if (s->is_utf16) {
-        if (is_digit(*p)) {
-          return re_parse_error(s,
-                                "invalid decimal escape in regular expression");
-        }
-      } else {
-        /* Annex B.1.4: accept legacy octal */
-        if (*p >= '0' && *p <= '7') {
-          c = *p++ - '0';
-          if (*p >= '0' && *p <= '7') {
-            c = (c << 3) + *p++ - '0';
-          }
-        }
-      }
-      goto normal_char;
-    case '1':
-    case '2':
-    case '3':
-    case '4':
-    case '5':
-    case '6':
-    case '7':
-    case '8':
-    case '9': {
-      const uint8_t *q = ++p;
-
-      c = parse_digits(&p, FALSE);
-      if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
-        if (!s->is_utf16) {
-          /* Annex B.1.4: accept legacy octal */
-          p = q;
-          if (*p <= '7') {
-            c = 0;
-            if (*p <= '3')
-              c = *p++ - '0';
-            if (*p >= '0' && *p <= '7') {
-              c = (c << 3) + *p++ - '0';
-              if (*p >= '0' && *p <= '7') {
-                c = (c << 3) + *p++ - '0';
-              }
+        } else {
+            const uint8_t *p1 = p + 1;
+            /* Annex B: error if it is like a repetition count */
+            parse_digits(&p1, TRUE);
+            if (*p1 == ',') {
+                p1++;
+                if (is_digit(*p1)) {
+                    parse_digits(&p1, TRUE);
+                }
+            }
+            if (*p1 != '}') {
+                goto parse_class_atom;
             }
-          } else {
-            c = *p++;
-          }
-          goto normal_char;
         }
-        return re_parse_error(
-            s, "back reference out of range in regular expression");
-      }
-    emit_back_reference:
-      last_atom_start = s->byte_code.size;
-      last_capture_count = s->capture_count;
-      re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
-    } break;
-    default:
-      goto parse_class_atom;
-    }
-    break;
-  case '[':
-    last_atom_start = s->byte_code.size;
-    last_capture_count = s->capture_count;
-    if (is_backward_dir)
-      re_emit_op(s, REOP_prev);
-    if (re_parse_char_class(s, &p))
-      return -1;
-    if (is_backward_dir)
-      re_emit_op(s, REOP_prev);
-    break;
-  case ']':
-  case '}':
-    if (s->is_utf16)
-      return re_parse_error(s, "syntax error");
-    goto parse_class_atom;
-  default:
-  parse_class_atom:
-    c = get_class_atom(s, cr, &p, FALSE);
-    if ((int)c < 0)
-      return -1;
-  normal_char:
-    last_atom_start = s->byte_code.size;
-    last_capture_count = s->capture_count;
-    if (is_backward_dir)
-      re_emit_op(s, REOP_prev);
-    if (c >= CLASS_RANGE_BASE) {
-      int ret;
-      /* Note: canonicalization is not needed */
-      ret = re_emit_range(s, cr);
-      cr_free(cr);
-      if (ret)
-        return -1;
-    } else {
-      if (s->ignore_case)
-        c = lre_canonicalize(c, s->is_utf16);
-      if (c <= 0xffff)
-        re_emit_op_u16(s, REOP_char, c);
-      else
-        re_emit_op_u32(s, REOP_char32, c);
-    }
-    if (is_backward_dir)
-      re_emit_op(s, REOP_prev);
-    break;
-  }
-
-  /* quantifier */
-  if (last_atom_start >= 0) {
-    c = *p;
-    switch (c) {
+        /* fall thru */
     case '*':
-      p++;
-      quant_min = 0;
-      quant_max = INT32_MAX;
-      goto quantifier;
     case '+':
-      p++;
-      quant_min = 1;
-      quant_max = INT32_MAX;
-      goto quantifier;
     case '?':
-      p++;
-      quant_min = 0;
-      quant_max = 1;
-      goto quantifier;
-    case '{': {
-      const uint8_t *p1 = p;
-      /* As an extension (see ES6 annex B), we accept '{' not
-         followed by digits as a normal atom */
-      if (!is_digit(p[1])) {
-        if (s->is_utf16)
-          goto invalid_quant_count;
-        break;
-      }
-      p++;
-      quant_min = parse_digits(&p, TRUE);
-      quant_max = quant_min;
-      if (*p == ',') {
-        p++;
-        if (is_digit(*p)) {
-          quant_max = parse_digits(&p, TRUE);
-          if (quant_max < quant_min) {
-          invalid_quant_count:
-            return re_parse_error(s, "invalid repetition count");
-          }
+        return re_parse_error(s, "nothing to repeat");
+    case '(':
+        if (p[1] == '?') {
+            if (p[2] == ':') {
+                p += 3;
+                last_atom_start = s->byte_code.size;
+                last_capture_count = s->capture_count;
+                s->buf_ptr = p;
+                if (re_parse_disjunction(s, is_backward_dir))
+                    return -1;
+                p = s->buf_ptr;
+                if (re_parse_expect(s, &p, ')'))
+                    return -1;
+            } else if ((p[2] == '=' || p[2] == '!')) {
+                is_neg = (p[2] == '!');
+                is_backward_lookahead = FALSE;
+                p += 3;
+                goto lookahead;
+            } else if (p[2] == '<' &&
+                       (p[3] == '=' || p[3] == '!')) {
+                int pos;
+                is_neg = (p[3] == '!');
+                is_backward_lookahead = TRUE;
+                p += 4;
+                /* lookahead */
+            lookahead:
+                /* Annex B allows lookahead to be used as an atom for
+                   the quantifiers */
+                if (!s->is_unicode && !is_backward_lookahead)  {
+                    last_atom_start = s->byte_code.size;
+                    last_capture_count = s->capture_count;
+                }
+                pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
+                s->buf_ptr = p;
+                if (re_parse_disjunction(s, is_backward_lookahead))
+                    return -1;
+                p = s->buf_ptr;
+                if (re_parse_expect(s, &p, ')'))
+                    return -1;
+                re_emit_op(s, REOP_match);
+                /* jump after the 'match' after the lookahead is successful */
+                if (dbuf_error(&s->byte_code))
+                    return -1;
+                put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
+            } else if (p[2] == '<') {
+                p += 3;
+                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
+                                        &p)) {
+                    return re_parse_error(s, "invalid group name");
+                }
+                if (find_group_name(s, s->u.tmp_buf) > 0) {
+                    return re_parse_error(s, "duplicate group name");
+                }
+                /* group name with a trailing zero */
+                dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
+                         strlen(s->u.tmp_buf) + 1);
+                s->has_named_captures = 1;
+                goto parse_capture;
+            } else {
+                return re_parse_error(s, "invalid group");
+            }
         } else {
-          quant_max = INT32_MAX; /* infinity */
+            int capture_index;
+            p++;
+            /* capture without group name */
+            dbuf_putc(&s->group_names, 0);
+        parse_capture:
+            if (s->capture_count >= CAPTURE_COUNT_MAX)
+                return re_parse_error(s, "too many captures");
+            last_atom_start = s->byte_code.size;
+            last_capture_count = s->capture_count;
+            capture_index = s->capture_count++;
+            re_emit_op_u8(s, REOP_save_start + is_backward_dir,
+                          capture_index);
+
+            s->buf_ptr = p;
+            if (re_parse_disjunction(s, is_backward_dir))
+                return -1;
+            p = s->buf_ptr;
+
+            re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
+                          capture_index);
+
+            if (re_parse_expect(s, &p, ')'))
+                return -1;
         }
-      }
-      if (*p != '}' && !s->is_utf16) {
-        /* Annex B: normal atom if invalid '{' syntax */
-        p = p1;
         break;
-      }
-      if (re_parse_expect(s, &p, '}'))
-        return -1;
-    }
-    quantifier:
-      greedy = TRUE;
-      if (*p == '?') {
-        p++;
-        greedy = FALSE;
-      }
-      if (last_atom_start < 0) {
-        return re_parse_error(s, "nothing to repeat");
-      }
-      if (greedy) {
-        int len, pos;
-
-        if (quant_max > 0) {
-          /* specific optimization for simple quantifiers */
-          if (dbuf_error(&s->byte_code))
-            goto out_of_memory;
-          len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
-                                        s->byte_code.size - last_atom_start);
-          if (len > 0) {
-            re_emit_op(s, REOP_match);
-
-            if (dbuf_insert(&s->byte_code, last_atom_start, 17))
-              goto out_of_memory;
-            pos = last_atom_start;
-            s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
-            put_u32(&s->byte_code.buf[pos],
-                    s->byte_code.size - last_atom_start - 17);
-            pos += 4;
-            put_u32(&s->byte_code.buf[pos], quant_min);
-            pos += 4;
-            put_u32(&s->byte_code.buf[pos], quant_max);
-            pos += 4;
-            put_u32(&s->byte_code.buf[pos], len);
-            pos += 4;
-            goto done;
-          }
-        }
-
-        if (dbuf_error(&s->byte_code))
-          goto out_of_memory;
-        add_zero_advance_check =
-            (re_check_advance(s->byte_code.buf + last_atom_start,
-                              s->byte_code.size - last_atom_start) == 0);
-      } else {
-        add_zero_advance_check = FALSE;
-      }
-
-      {
-        int len, pos;
-        len = s->byte_code.size - last_atom_start;
-        if (quant_min == 0) {
-          /* need to reset the capture in case the atom is
-             not executed */
-          if (last_capture_count != s->capture_count) {
-            if (dbuf_insert(&s->byte_code, last_atom_start, 3))
-              goto out_of_memory;
-            s->byte_code.buf[last_atom_start++] = REOP_save_reset;
-            s->byte_code.buf[last_atom_start++] = last_capture_count;
-            s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
-          }
-          if (quant_max == 0) {
-            s->byte_code.size = last_atom_start;
-          } else if (quant_max == 1) {
-            if (dbuf_insert(&s->byte_code, last_atom_start, 5))
-              goto out_of_memory;
-            s->byte_code.buf[last_atom_start] = REOP_split_goto_first + greedy;
-            put_u32(s->byte_code.buf + last_atom_start + 1, len);
-          } else if (quant_max == INT32_MAX) {
-            if (dbuf_insert(&s->byte_code, last_atom_start,
-                            5 + add_zero_advance_check))
-              goto out_of_memory;
-            s->byte_code.buf[last_atom_start] = REOP_split_goto_first + greedy;
-            put_u32(s->byte_code.buf + last_atom_start + 1,
-                    len + 5 + add_zero_advance_check);
-            if (add_zero_advance_check) {
-              /* avoid infinite loop by stoping the
-                 recursion if no advance was made in the
-                 atom (only works if the atom has no
-                 side effect) */
-              s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
-              re_emit_goto(s, REOP_bne_char_pos, last_atom_start);
+    case '\\':
+        switch(p[1]) {
+        case 'b':
+        case 'B':
+            re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
+            p += 2;
+            break;
+        case 'k':
+            {
+                const uint8_t *p1;
+                int dummy_res;
+
+                p1 = p;
+                if (p1[2] != '<') {
+                    /* annex B: we tolerate invalid group names in non
+                       unicode mode if there is no named capture
+                       definition */
+                    if (s->is_unicode || re_has_named_captures(s))
+                        return re_parse_error(s, "expecting group name");
+                    else
+                        goto parse_class_atom;
+                }
+                p1 += 3;
+                if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
+                                        &p1)) {
+                    if (s->is_unicode || re_has_named_captures(s))
+                        return re_parse_error(s, "invalid group name");
+                    else
+                        goto parse_class_atom;
+                }
+                c = find_group_name(s, s->u.tmp_buf);
+                if (c < 0) {
+                    /* no capture name parsed before, try to look
+                       after (inefficient, but hopefully not common) */
+                    c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
+                    if (c < 0) {
+                        if (s->is_unicode || re_has_named_captures(s))
+                            return re_parse_error(s, "group name not defined");
+                        else
+                            goto parse_class_atom;
+                    }
+                }
+                p = p1;
+            }
+            goto emit_back_reference;
+        case '0':
+            p += 2;
+            c = 0;
+            if (s->is_unicode) {
+                if (is_digit(*p)) {
+                    return re_parse_error(s, "invalid decimal escape in regular expression");
+                }
             } else {
-              re_emit_goto(s, REOP_goto, last_atom_start);
+                /* Annex B.1.4: accept legacy octal */
+                if (*p >= '0' && *p <= '7') {
+                    c = *p++ - '0';
+                    if (*p >= '0' && *p <= '7') {
+                        c = (c << 3) + *p++ - '0';
+                    }
+                }
+            }
+            goto normal_char;
+        case '1': case '2': case '3': case '4':
+        case '5': case '6': case '7': case '8':
+        case '9':
+            {
+                const uint8_t *q = ++p;
+
+                c = parse_digits(&p, FALSE);
+                if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
+                    if (!s->is_unicode) {
+                        /* Annex B.1.4: accept legacy octal */
+                        p = q;
+                        if (*p <= '7') {
+                            c = 0;
+                            if (*p <= '3')
+                                c = *p++ - '0';
+                            if (*p >= '0' && *p <= '7') {
+                                c = (c << 3) + *p++ - '0';
+                                if (*p >= '0' && *p <= '7') {
+                                    c = (c << 3) + *p++ - '0';
+                                }
+                            }
+                        } else {
+                            c = *p++;
+                        }
+                        goto normal_char;
+                    }
+                    return re_parse_error(s, "back reference out of range in regular expression");
+                }
+            emit_back_reference:
+                last_atom_start = s->byte_code.size;
+                last_capture_count = s->capture_count;
+                re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
             }
-          } else {
-            if (dbuf_insert(&s->byte_code, last_atom_start, 10))
-              goto out_of_memory;
-            pos = last_atom_start;
-            s->byte_code.buf[pos++] = REOP_push_i32;
-            put_u32(s->byte_code.buf + pos, quant_max);
-            pos += 4;
-            s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
-            put_u32(s->byte_code.buf + pos, len + 5);
-            re_emit_goto(s, REOP_loop, last_atom_start + 5);
-            re_emit_op(s, REOP_drop);
-          }
-        } else if (quant_min == 1 && quant_max == INT32_MAX &&
-                   !add_zero_advance_check) {
-          re_emit_goto(s, REOP_split_next_first - greedy, last_atom_start);
+            break;
+        default:
+            goto parse_class_atom;
+        }
+        break;
+    case '[':
+        last_atom_start = s->byte_code.size;
+        last_capture_count = s->capture_count;
+        if (is_backward_dir)
+            re_emit_op(s, REOP_prev);
+        if (re_parse_char_class(s, &p))
+            return -1;
+        if (is_backward_dir)
+            re_emit_op(s, REOP_prev);
+        break;
+    case ']':
+    case '}':
+        if (s->is_unicode)
+            return re_parse_error(s, "syntax error");
+        goto parse_class_atom;
+    default:
+    parse_class_atom:
+        c = get_class_atom(s, cr, &p, FALSE);
+        if ((int)c < 0)
+            return -1;
+    normal_char:
+        last_atom_start = s->byte_code.size;
+        last_capture_count = s->capture_count;
+        if (is_backward_dir)
+            re_emit_op(s, REOP_prev);
+        if (c >= CLASS_RANGE_BASE) {
+            int ret;
+            /* Note: canonicalization is not needed */
+            ret = re_emit_range(s, cr);
+            cr_free(cr);
+            if (ret)
+                return -1;
         } else {
-          if (quant_min == 1) {
-            /* nothing to add */
-          } else {
-            if (dbuf_insert(&s->byte_code, last_atom_start, 5))
-              goto out_of_memory;
-            s->byte_code.buf[last_atom_start] = REOP_push_i32;
-            put_u32(s->byte_code.buf + last_atom_start + 1, quant_min);
-            last_atom_start += 5;
-            re_emit_goto(s, REOP_loop, last_atom_start);
-            re_emit_op(s, REOP_drop);
-          }
-          if (quant_max == INT32_MAX) {
-            pos = s->byte_code.size;
-            re_emit_op_u32(s, REOP_split_goto_first + greedy,
-                           len + 5 + add_zero_advance_check);
-            if (add_zero_advance_check)
-              re_emit_op(s, REOP_push_char_pos);
-            /* copy the atom */
-            dbuf_put_self(&s->byte_code, last_atom_start, len);
-            if (add_zero_advance_check)
-              re_emit_goto(s, REOP_bne_char_pos, pos);
+            if (s->ignore_case)
+                c = lre_canonicalize(c, s->is_unicode);
+            if (c <= 0xffff)
+                re_emit_op_u16(s, REOP_char, c);
             else
-              re_emit_goto(s, REOP_goto, pos);
-          } else if (quant_max > quant_min) {
-            re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
-            pos = s->byte_code.size;
-            re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5);
-            /* copy the atom */
-            dbuf_put_self(&s->byte_code, last_atom_start, len);
-
-            re_emit_goto(s, REOP_loop, pos);
-            re_emit_op(s, REOP_drop);
-          }
+                re_emit_op_u32(s, REOP_char32, c);
         }
-        last_atom_start = -1;
-      }
-      break;
-    default:
-      break;
+        if (is_backward_dir)
+            re_emit_op(s, REOP_prev);
+        break;
     }
-  }
-done:
-  s->buf_ptr = p;
-  return 0;
-out_of_memory:
-  return re_parse_out_of_memory(s);
+
+    /* quantifier */
+    if (last_atom_start >= 0) {
+        c = *p;
+        switch(c) {
+        case '*':
+            p++;
+            quant_min = 0;
+            quant_max = INT32_MAX;
+            goto quantifier;
+        case '+':
+            p++;
+            quant_min = 1;
+            quant_max = INT32_MAX;
+            goto quantifier;
+        case '?':
+            p++;
+            quant_min = 0;
+            quant_max = 1;
+            goto quantifier;
+        case '{':
+            {
+                const uint8_t *p1 = p;
+                /* As an extension (see ES6 annex B), we accept '{' not
+                   followed by digits as a normal atom */
+                if (!is_digit(p[1])) {
+                    if (s->is_unicode)
+                        goto invalid_quant_count;
+                    break;
+                }
+                p++;
+                quant_min = parse_digits(&p, TRUE);
+                quant_max = quant_min;
+                if (*p == ',') {
+                    p++;
+                    if (is_digit(*p)) {
+                        quant_max = parse_digits(&p, TRUE);
+                        if (quant_max < quant_min) {
+                        invalid_quant_count:
+                            return re_parse_error(s, "invalid repetition count");
+                        }
+                    } else {
+                        quant_max = INT32_MAX; /* infinity */
+                    }
+                }
+                if (*p != '}' && !s->is_unicode) {
+                    /* Annex B: normal atom if invalid '{' syntax */
+                    p = p1;
+                    break;
+                }
+                if (re_parse_expect(s, &p, '}'))
+                    return -1;
+            }
+        quantifier:
+            greedy = TRUE;
+            if (*p == '?') {
+                p++;
+                greedy = FALSE;
+            }
+            if (last_atom_start < 0) {
+                return re_parse_error(s, "nothing to repeat");
+            }
+            if (greedy) {
+                int len, pos;
+
+                if (quant_max > 0) {
+                    /* specific optimization for simple quantifiers */
+                    if (dbuf_error(&s->byte_code))
+                        goto out_of_memory;
+                    len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
+                                                 s->byte_code.size - last_atom_start);
+                    if (len > 0) {
+                        re_emit_op(s, REOP_match);
+
+                        if (dbuf_insert(&s->byte_code, last_atom_start, 17))
+                            goto out_of_memory;
+                        pos = last_atom_start;
+                        s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
+                        put_u32(&s->byte_code.buf[pos],
+                                s->byte_code.size - last_atom_start - 17);
+                        pos += 4;
+                        put_u32(&s->byte_code.buf[pos], quant_min);
+                        pos += 4;
+                        put_u32(&s->byte_code.buf[pos], quant_max);
+                        pos += 4;
+                        put_u32(&s->byte_code.buf[pos], len);
+                        pos += 4;
+                        goto done;
+                    }
+                }
+
+                if (dbuf_error(&s->byte_code))
+                    goto out_of_memory;
+            }
+            /* the spec says that if there is no advance when
+               running the atom after the first quant_min times,
+               then there is no match. We remove this test when we
+               are sure the atom always advances the position. */
+            add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
+                                                           s->byte_code.size - last_atom_start);
+
+            {
+                int len, pos;
+                len = s->byte_code.size - last_atom_start;
+                if (quant_min == 0) {
+                    /* need to reset the capture in case the atom is
+                       not executed */
+                    if (last_capture_count != s->capture_count) {
+                        if (dbuf_insert(&s->byte_code, last_atom_start, 3))
+                            goto out_of_memory;
+                        s->byte_code.buf[last_atom_start++] = REOP_save_reset;
+                        s->byte_code.buf[last_atom_start++] = last_capture_count;
+                        s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
+                    }
+                    if (quant_max == 0) {
+                        s->byte_code.size = last_atom_start;
+                    } else if (quant_max == 1 || quant_max == INT32_MAX) {
+                        BOOL has_goto = (quant_max == INT32_MAX);
+                        if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
+                            goto out_of_memory;
+                        s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
+                            greedy;
+                        put_u32(s->byte_code.buf + last_atom_start + 1,
+                                len + 5 * has_goto + add_zero_advance_check * 2);
+                        if (add_zero_advance_check) {
+                            s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
+                            re_emit_op(s, REOP_check_advance);
+                        }
+                        if (has_goto)
+                            re_emit_goto(s, REOP_goto, last_atom_start);
+                    } else {
+                        if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
+                            goto out_of_memory;
+                        pos = last_atom_start;
+                        s->byte_code.buf[pos++] = REOP_push_i32;
+                        put_u32(s->byte_code.buf + pos, quant_max);
+                        pos += 4;
+                        s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
+                        put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
+                        pos += 4;
+                        if (add_zero_advance_check) {
+                            s->byte_code.buf[pos++] = REOP_push_char_pos;
+                            re_emit_op(s, REOP_check_advance);
+                        }
+                        re_emit_goto(s, REOP_loop, last_atom_start + 5);
+                        re_emit_op(s, REOP_drop);
+                    }
+                } else if (quant_min == 1 && quant_max == INT32_MAX &&
+                           !add_zero_advance_check) {
+                    re_emit_goto(s, REOP_split_next_first - greedy,
+                                 last_atom_start);
+                } else {
+                    if (quant_min == 1) {
+                        /* nothing to add */
+                    } else {
+                        if (dbuf_insert(&s->byte_code, last_atom_start, 5))
+                            goto out_of_memory;
+                        s->byte_code.buf[last_atom_start] = REOP_push_i32;
+                        put_u32(s->byte_code.buf + last_atom_start + 1,
+                                quant_min);
+                        last_atom_start += 5;
+                        re_emit_goto(s, REOP_loop, last_atom_start);
+                        re_emit_op(s, REOP_drop);
+                    }
+                    if (quant_max == INT32_MAX) {
+                        pos = s->byte_code.size;
+                        re_emit_op_u32(s, REOP_split_goto_first + greedy,
+                                       len + 5 + add_zero_advance_check * 2);
+                        if (add_zero_advance_check)
+                            re_emit_op(s, REOP_push_char_pos);
+                        /* copy the atom */
+                        dbuf_put_self(&s->byte_code, last_atom_start, len);
+                        if (add_zero_advance_check)
+                            re_emit_op(s, REOP_check_advance);
+                        re_emit_goto(s, REOP_goto, pos);
+                    } else if (quant_max > quant_min) {
+                        re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
+                        pos = s->byte_code.size;
+                        re_emit_op_u32(s, REOP_split_goto_first + greedy,
+                                       len + 5 + add_zero_advance_check * 2);
+                        if (add_zero_advance_check)
+                            re_emit_op(s, REOP_push_char_pos);
+                        /* copy the atom */
+                        dbuf_put_self(&s->byte_code, last_atom_start, len);
+                        if (add_zero_advance_check)
+                            re_emit_op(s, REOP_check_advance);
+                        re_emit_goto(s, REOP_loop, pos);
+                        re_emit_op(s, REOP_drop);
+                    }
+                }
+                last_atom_start = -1;
+            }
+            break;
+        default:
+            break;
+        }
+    }
+ done:
+    s->buf_ptr = p;
+    return 0;
+ out_of_memory:
+    return re_parse_out_of_memory(s);
 }
 
-static int re_parse_alternative(REParseState *s, BOOL is_backward_dir) {
-  const uint8_t *p;
-  int ret;
-  size_t start, term_start, end, term_size;
+static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
+{
+    const uint8_t *p;
+    int ret;
+    size_t start, term_start, end, term_size;
 
-  start = s->byte_code.size;
-  for (;;) {
-    p = s->buf_ptr;
-    if (p >= s->buf_end)
-      break;
-    if (*p == '|' || *p == ')')
-      break;
-    term_start = s->byte_code.size;
-    ret = re_parse_term(s, is_backward_dir);
-    if (ret)
-      return ret;
-    if (is_backward_dir) {
-      /* reverse the order of the terms (XXX: inefficient, but
-         speed is not really critical here) */
-      end = s->byte_code.size;
-      term_size = end - term_start;
-      if (dbuf_realloc(&s->byte_code, end + term_size))
-        return -1;
-      memmove(s->byte_code.buf + start + term_size, s->byte_code.buf + start,
-              end - start);
-      memcpy(s->byte_code.buf + start, s->byte_code.buf + end, term_size);
+    start = s->byte_code.size;
+    for(;;) {
+        p = s->buf_ptr;
+        if (p >= s->buf_end)
+            break;
+        if (*p == '|' || *p == ')')
+            break;
+        term_start = s->byte_code.size;
+        ret = re_parse_term(s, is_backward_dir);
+        if (ret)
+            return ret;
+        if (is_backward_dir) {
+            /* reverse the order of the terms (XXX: inefficient, but
+               speed is not really critical here) */
+            end = s->byte_code.size;
+            term_size = end - term_start;
+            if (dbuf_realloc(&s->byte_code, end + term_size))
+                return -1;
+            memmove(s->byte_code.buf + start + term_size,
+                    s->byte_code.buf + start,
+                    end - start);
+            memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
+                   term_size);
+        }
     }
-  }
-  return 0;
+    return 0;
 }
 
-static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir) {
-  int start, len, pos;
+static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
+{
+    int start, len, pos;
 
-  if (lre_check_stack_overflow(s->opaque, 0))
-    return re_parse_error(s, "stack overflow");
+    if (lre_check_stack_overflow(s->opaque, 0))
+        return re_parse_error(s, "stack overflow");
 
-  start = s->byte_code.size;
-  if (re_parse_alternative(s, is_backward_dir))
-    return -1;
-  while (*s->buf_ptr == '|') {
-    s->buf_ptr++;
+    start = s->byte_code.size;
+    if (re_parse_alternative(s, is_backward_dir))
+        return -1;
+    while (*s->buf_ptr == '|') {
+        s->buf_ptr++;
 
-    len = s->byte_code.size - start;
+        len = s->byte_code.size - start;
 
-    /* insert a split before the first alternative */
-    if (dbuf_insert(&s->byte_code, start, 5)) {
-      return re_parse_out_of_memory(s);
-    }
-    s->byte_code.buf[start] = REOP_split_next_first;
-    put_u32(s->byte_code.buf + start + 1, len + 5);
+        /* insert a split before the first alternative */
+        if (dbuf_insert(&s->byte_code, start, 5)) {
+            return re_parse_out_of_memory(s);
+        }
+        s->byte_code.buf[start] = REOP_split_next_first;
+        put_u32(s->byte_code.buf + start + 1, len + 5);
 
-    pos = re_emit_op_u32(s, REOP_goto, 0);
+        pos = re_emit_op_u32(s, REOP_goto, 0);
 
-    if (re_parse_alternative(s, is_backward_dir))
-      return -1;
+        if (re_parse_alternative(s, is_backward_dir))
+            return -1;
 
-    /* patch the goto */
-    len = s->byte_code.size - (pos + 4);
-    put_u32(s->byte_code.buf + pos, len);
-  }
-  return 0;
+        /* patch the goto */
+        len = s->byte_code.size - (pos + 4);
+        put_u32(s->byte_code.buf + pos, len);
+    }
+    return 0;
 }
 
 /* the control flow is recursive so the analysis can be linear */
-static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) {
-  int stack_size, stack_size_max, pos, opcode, len;
-  uint32_t val;
-
-  stack_size = 0;
-  stack_size_max = 0;
-  bc_buf += RE_HEADER_LEN;
-  bc_buf_len -= RE_HEADER_LEN;
-  pos = 0;
-  while (pos < bc_buf_len) {
-    opcode = bc_buf[pos];
-    len = reopcode_info[opcode].size;
-    assert(opcode < REOP_COUNT);
-    assert((pos + len) <= bc_buf_len);
-    switch (opcode) {
-    case REOP_push_i32:
-    case REOP_push_char_pos:
-      stack_size++;
-      if (stack_size > stack_size_max) {
-        if (stack_size > STACK_SIZE_MAX)
-          return -1;
-        stack_size_max = stack_size;
-      }
-      break;
-    case REOP_drop:
-    case REOP_bne_char_pos:
-      assert(stack_size > 0);
-      stack_size--;
-      break;
-    case REOP_range:
-      val = get_u16(bc_buf + pos + 1);
-      len += val * 4;
-      break;
-    case REOP_range32:
-      val = get_u16(bc_buf + pos + 1);
-      len += val * 8;
-      break;
+static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
+{
+    int stack_size, stack_size_max, pos, opcode, len;
+    uint32_t val;
+
+    stack_size = 0;
+    stack_size_max = 0;
+    bc_buf += RE_HEADER_LEN;
+    bc_buf_len -= RE_HEADER_LEN;
+    pos = 0;
+    while (pos < bc_buf_len) {
+        opcode = bc_buf[pos];
+        len = reopcode_info[opcode].size;
+        assert(opcode < REOP_COUNT);
+        assert((pos + len) <= bc_buf_len);
+        switch(opcode) {
+        case REOP_push_i32:
+        case REOP_push_char_pos:
+            stack_size++;
+            if (stack_size > stack_size_max) {
+                if (stack_size > STACK_SIZE_MAX)
+                    return -1;
+                stack_size_max = stack_size;
+            }
+            break;
+        case REOP_drop:
+        case REOP_check_advance:
+            assert(stack_size > 0);
+            stack_size--;
+            break;
+        case REOP_range:
+            val = get_u16(bc_buf + pos + 1);
+            len += val * 4;
+            break;
+        case REOP_range32:
+            val = get_u16(bc_buf + pos + 1);
+            len += val * 8;
+            break;
+        }
+        pos += len;
     }
-    pos += len;
-  }
-  return stack_size_max;
+    return stack_size_max;
 }
 
 /* 'buf' must be a zero terminated UTF-8 string of length buf_len.
@@ -1785,750 +1718,784 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) {
 */
 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
                      const char *buf, size_t buf_len, int re_flags,
-                     void *opaque) {
-  REParseState s_s, *s = &s_s;
-  int stack_size;
-  BOOL is_sticky;
-
-  memset(s, 0, sizeof(*s));
-  s->opaque = opaque;
-  s->buf_ptr = (const uint8_t *)buf;
-  s->buf_end = s->buf_ptr + buf_len;
-  s->buf_start = s->buf_ptr;
-  s->re_flags = re_flags;
-  s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
-  is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
-  s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
-  s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
-  s->capture_count = 1;
-  s->total_capture_count = -1;
-  s->has_named_captures = -1;
-
-  dbuf_init2(&s->byte_code, opaque, lre_realloc);
-  dbuf_init2(&s->group_names, opaque, lre_realloc);
-
-  dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
-  dbuf_putc(&s->byte_code, 0);    /* second element is the number of captures */
-  dbuf_putc(&s->byte_code, 0);    /* stack size */
-  dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
-
-  if (!is_sticky) {
-    /* iterate thru all positions (about the same as .*?( ... ) )
-       .  We do it without an explicit loop so that lock step
-       thread execution will be possible in an optimized
-       implementation */
-    re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
-    re_emit_op(s, REOP_any);
-    re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
-  }
-  re_emit_op_u8(s, REOP_save_start, 0);
-
-  if (re_parse_disjunction(s, FALSE)) {
-  error:
-    dbuf_free(&s->byte_code);
-    dbuf_free(&s->group_names);
-    pstrcpy(error_msg, error_msg_size, s->u.error_msg);
-    *plen = 0;
-    return NULL;
-  }
+                     void *opaque)
+{ /* returns the bytecode (length in *plen), or NULL with error_msg set */
+    REParseState s_s, *s = &s_s;
+    int stack_size;
+    BOOL is_sticky;
+
+    memset(s, 0, sizeof(*s));
+    s->opaque = opaque;
+    s->buf_ptr = (const uint8_t *)buf;
+    s->buf_end = s->buf_ptr + buf_len;
+    s->buf_start = s->buf_ptr;
+    s->re_flags = re_flags;
+    s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
+    is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
+    s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
+    s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
+    s->capture_count = 1; /* capture 0 is the save_start/save_end 0 pair emitted around the pattern below */
+    s->total_capture_count = -1; /* NOTE(review): -1 presumably means "not computed yet" - confirm */
+    s->has_named_captures = -1; /* NOTE(review): same -1 sentinel assumption as above - confirm */
+
+    dbuf_init2(&s->byte_code, opaque, lre_realloc);
+    dbuf_init2(&s->group_names, opaque, lre_realloc);
+
+    dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
+    dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
+    dbuf_putc(&s->byte_code, 0); /* stack size */
+    dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
+
+    if (!is_sticky) {
+        /* iterate through all positions (about the same as .*?( ... ) )
+           .  We do it without an explicit loop so that lock step
+           thread execution will be possible in an optimized
+           implementation */
+        re_emit_op_u32(s, REOP_split_goto_first, 1 + 5); /* offset skips REOP_any (1 byte) + REOP_goto (5 bytes) */
+        re_emit_op(s, REOP_any);
+        re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5)); /* back over goto (5) + any (1) + split (5) */
+    }
+    re_emit_op_u8(s, REOP_save_start, 0);
+
+    if (re_parse_disjunction(s, FALSE)) {
+    error: /* shared failure exit; also reached by the "goto error" statements below */
+        dbuf_free(&s->byte_code);
+        dbuf_free(&s->group_names);
+        pstrcpy(error_msg, error_msg_size, s->u.error_msg);
+        *plen = 0;
+        return NULL;
+    }
+
+    re_emit_op_u8(s, REOP_save_end, 0);
 
-  re_emit_op_u8(s, REOP_save_end, 0);
+    re_emit_op(s, REOP_match);
 
-  re_emit_op(s, REOP_match);
+    if (*s->buf_ptr != '\0') { /* parsing stopped before the terminating zero byte of 'buf' */
+        re_parse_error(s, "extraneous characters at the end");
+        goto error;
+    }
 
-  if (*s->buf_ptr != '\0') {
-    re_parse_error(s, "extraneous characters at the end");
-    goto error;
-  }
+    if (dbuf_error(&s->byte_code)) {
+        re_parse_out_of_memory(s);
+        goto error;
+    }
 
-  if (dbuf_error(&s->byte_code)) {
-    re_parse_out_of_memory(s);
-    goto error;
-  }
-
-  stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
-  if (stack_size < 0) {
-    re_parse_error(s, "too many imbricated quantifiers");
-    goto error;
-  }
-
-  s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
-  s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
-  put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);
-
-  /* add the named groups if needed */
-  if (s->group_names.size > (s->capture_count - 1)) {
-    dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
-    s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
-  }
-  dbuf_free(&s->group_names);
+    stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); /* -1 once the depth exceeds STACK_SIZE_MAX */
+    if (stack_size < 0) {
+        re_parse_error(s, "too many imbricated quantifiers");
+        goto error;
+    }
+
+    s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; /* back-patch the header placeholders written at the top */
+    s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
+    put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
+            s->byte_code.size - RE_HEADER_LEN); /* stored length excludes the header */
+
+    /* add the named groups if needed */
+    if (s->group_names.size > (s->capture_count - 1)) { /* NOTE(review): presumably "some group has a non-empty name" - confirm against the group_names encoding */
+        dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
+        s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
+    }
+    dbuf_free(&s->group_names);
 
 #ifdef DUMP_REOP
-  lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
+    lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
 #endif
 
-  error_msg[0] = '\0';
-  *plen = s->byte_code.size;
-  return s->byte_code.buf;
+    error_msg[0] = '\0'; /* success: report an empty error message */
+    *plen = s->byte_code.size;
+    return s->byte_code.buf; /* byte_code is not freed here; the buffer is handed to the caller */
 }
 
-static BOOL is_line_terminator(uint32_t c) {
-  return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
+static BOOL is_line_terminator(uint32_t c)
+{ /* true for LF, CR, CP_LS and CP_PS (presumably U+2028 / U+2029 - confirm) */
+    return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
 }
 
-static BOOL is_word_char(uint32_t c) {
-  return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
-          (c >= 'A' && c <= 'Z') || (c == '_'));
+static BOOL is_word_char(uint32_t c)
+{ /* ASCII-only word test: digits, letters and '_' */
+    return ((c >= '0' && c <= '9') ||
+            (c >= 'a' && c <= 'z') ||
+            (c >= 'A' && c <= 'Z') ||
+            (c == '_'));
 }
 
-#define GET_CHAR(c, cptr, cbuf_end)                                            \
-  do {                                                                         \
-    if (cbuf_type == 0) {                                                      \
-      c = *cptr++;                                                             \
-    } else {                                                                   \
-      uint32_t __c1;                                                           \
-      c = *(uint16_t *)cptr;                                                   \
-      cptr += 2;                                                               \
-      if (c >= 0xd800 && c < 0xdc00 && cbuf_type == 2 && cptr < cbuf_end) {    \
-        __c1 = *(uint16_t *)cptr;                                              \
-        if (__c1 >= 0xdc00 && __c1 < 0xe000) {                                 \
-          c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000;                \
-          cptr += 2;                                                           \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-  } while (0)
-
-#define PEEK_CHAR(c, cptr, cbuf_end)                                           \
-  do {                                                                         \
-    if (cbuf_type == 0) {                                                      \
-      c = cptr[0];                                                             \
-    } else {                                                                   \
-      uint32_t __c1;                                                           \
-      c = ((uint16_t *)cptr)[0];                                               \
-      if (c >= 0xd800 && c < 0xdc00 && cbuf_type == 2 &&                       \
-          (cptr + 2) < cbuf_end) {                                             \
-        __c1 = ((uint16_t *)cptr)[1];                                          \
-        if (__c1 >= 0xdc00 && __c1 < 0xe000) {                                 \
-          c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000;                \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-  } while (0)
-
-#define PEEK_PREV_CHAR(c, cptr, cbuf_start)                                    \
-  do {                                                                         \
-    if (cbuf_type == 0) {                                                      \
-      c = cptr[-1];                                                            \
-    } else {                                                                   \
-      uint32_t __c1;                                                           \
-      c = ((uint16_t *)cptr)[-1];                                              \
-      if (c >= 0xdc00 && c < 0xe000 && cbuf_type == 2 &&                       \
-          (cptr - 4) >= cbuf_start) {                                          \
-        __c1 = ((uint16_t *)cptr)[-2];                                         \
-        if (__c1 >= 0xd800 && __c1 < 0xdc00) {                                 \
-          c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000;                \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-  } while (0)
-
-#define GET_PREV_CHAR(c, cptr, cbuf_start)                                     \
-  do {                                                                         \
-    if (cbuf_type == 0) {                                                      \
-      cptr--;                                                                  \
-      c = cptr[0];                                                             \
-    } else {                                                                   \
-      uint32_t __c1;                                                           \
-      cptr -= 2;                                                               \
-      c = ((uint16_t *)cptr)[0];                                               \
-      if (c >= 0xdc00 && c < 0xe000 && cbuf_type == 2 && cptr > cbuf_start) {  \
-        __c1 = ((uint16_t *)cptr)[-1];                                         \
-        if (__c1 >= 0xd800 && __c1 < 0xdc00) {                                 \
-          cptr -= 2;                                                           \
-          c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000;                \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-  } while (0)
-
-#define PREV_CHAR(cptr, cbuf_start)                                            \
-  do {                                                                         \
-    if (cbuf_type == 0) {                                                      \
-      cptr--;                                                                  \
-    } else {                                                                   \
-      cptr -= 2;                                                               \
-      if (cbuf_type == 2) {                                                    \
-        c = ((uint16_t *)cptr)[0];                                             \
-        if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) {                  \
-          c = ((uint16_t *)cptr)[-1];                                          \
-          if (c >= 0xd800 && c < 0xdc00)                                       \
-            cptr -= 2;                                                         \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-  } while (0)
+#define GET_CHAR(c, cptr, cbuf_end, cbuf_type)                          \
+    do { /* decode the char at cptr into c, then advance cptr */        \
+        if (cbuf_type == 0) { /* 8 bit chars */                         \
+            c = *cptr++;                                                \
+        } else { /* 16 bit code units */                                \
+            const uint16_t *_p = (const uint16_t *)cptr;                \
+            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
+            c = *_p++;                                                  \
+            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
+                if (_p < _end && is_lo_surrogate(*_p)) {                \
+                    c = from_surrogate(c, *_p++); /* pair consumed */   \
+                }                                                       \
+            }                                                           \
+            cptr = (const void *)_p;                                    \
+        }                                                               \
+    } while (0)
+
+#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type)                         \
+    do { /* decode the char at cptr into c; cptr is not moved */        \
+        if (cbuf_type == 0) {                                           \
+            c = cptr[0];                                                \
+        } else {                                                        \
+            const uint16_t *_p = (const uint16_t *)cptr;                \
+            const uint16_t *_end = (const uint16_t *)cbuf_end;          \
+            c = *_p++;                                                  \
+            if (is_hi_surrogate(c) && cbuf_type == 2) {                 \
+                if (_p < _end && is_lo_surrogate(*_p)) {                \
+                    c = from_surrogate(c, *_p);                         \
+                }                                                       \
+            }                                                           \
+        }                                                               \
+    } while (0)
+
+#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                  \
+    do { /* decode the char ending at cptr into c; cptr kept */         \
+        if (cbuf_type == 0) {                                           \
+            c = cptr[-1];                                               \
+        } else {                                                        \
+            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
+            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
+            c = *_p;                                                    \
+            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
+                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
+                    c = from_surrogate(*--_p, c);                       \
+                }                                                       \
+            }                                                           \
+        }                                                               \
+    } while (0)
+
+#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type)                   \
+    do { /* move cptr back one char and decode it into c */             \
+        if (cbuf_type == 0) {                                           \
+            cptr--;                                                     \
+            c = cptr[0];                                                \
+        } else {                                                        \
+            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
+            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
+            c = *_p;                                                    \
+            if (is_lo_surrogate(c) && cbuf_type == 2) {                 \
+                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
+                    c = from_surrogate(*--_p, c);                       \
+                }                                                       \
+            }                                                           \
+            cptr = (const void *)_p;                                    \
+        }                                                               \
+    } while (0)
+
+#define PREV_CHAR(cptr, cbuf_start, cbuf_type)                          \
+    do { /* move cptr back one char without decoding it */              \
+        if (cbuf_type == 0) {                                           \
+            cptr--;                                                     \
+        } else {                                                        \
+            const uint16_t *_p = (const uint16_t *)cptr - 1;            \
+            const uint16_t *_start = (const uint16_t *)cbuf_start;      \
+            if (is_lo_surrogate(*_p) && cbuf_type == 2) {               \
+                if (_p > _start && is_hi_surrogate(_p[-1])) {           \
+                    --_p;                                               \
+                }                                                       \
+            }                                                           \
+            cptr = (const void *)_p;                                    \
+        }                                                               \
+    } while (0)
 
 typedef uintptr_t StackInt;
 
 typedef enum {
-  RE_EXEC_STATE_SPLIT,
-  RE_EXEC_STATE_LOOKAHEAD,
-  RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
-  RE_EXEC_STATE_GREEDY_QUANT,
+    RE_EXEC_STATE_SPLIT, /* alternative path saved by the REOP_split_* opcodes */
+    RE_EXEC_STATE_LOOKAHEAD, /* order matters: the push computes LOOKAHEAD + (opcode - REOP_lookahead) */
+    RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, /* must directly follow RE_EXEC_STATE_LOOKAHEAD */
+    RE_EXEC_STATE_GREEDY_QUANT, /* on failure the matcher steps back and retries while count > 0 */
 } REExecStateEnum;
 
 typedef struct REExecState {
-  REExecStateEnum type : 8;
-  uint8_t stack_len;
-  size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
-  const uint8_t *cptr;
-  const uint8_t *pc;
-  void *buf[0];
+    REExecStateEnum type : 8;
+    uint8_t stack_len; /* number of StackInt values saved after the captures in buf[] */
+    size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
+    const uint8_t *cptr; /* input position to restore */
+    const uint8_t *pc; /* bytecode position to restore */
+    void *buf[0]; /* 2 * capture_count capture pointers, then stack_len StackInt values */
 } REExecState;
 
 typedef struct {
-  const uint8_t *cbuf;
-  const uint8_t *cbuf_end;
-  /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
-  int cbuf_type;
-  int capture_count;
-  int stack_size_max;
-  BOOL multi_line;
-  BOOL ignore_case;
-  BOOL is_utf16;
-  void *opaque; /* used for stack overflow check */
-
-  size_t state_size;
-  uint8_t *state_stack;
-  size_t state_stack_size;
-  size_t state_stack_len;
+    const uint8_t *cbuf; /* start of the input buffer */
+    const uint8_t *cbuf_end; /* one past the last input byte */
+    /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
+    int cbuf_type;
+    int capture_count;
+    int stack_size_max;
+    BOOL multi_line; /* lets line_start / line_end match next to a line terminator */
+    BOOL ignore_case; /* canonicalize input chars before comparing */
+    BOOL is_unicode; /* forwarded to lre_canonicalize() */
+    void *opaque; /* used for stack overflow check */
+
+    size_t state_size; /* bytes per saved state in state_stack */
+    uint8_t *state_stack; /* array of saved states, grown by push_state() */
+    size_t state_stack_size; /* capacity, counted in states */
+    size_t state_stack_len; /* states currently in use */
 } REExecContext;
 
-static int push_state(REExecContext *s, uint8_t **capture, StackInt *stack,
-                      size_t stack_len, const uint8_t *pc, const uint8_t *cptr,
-                      REExecStateEnum type, size_t count) {
-  REExecState *rs;
-  uint8_t *new_stack;
-  size_t new_size, i, n;
-  StackInt *stack_buf;
-
-  if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
-    /* reallocate the stack */
-    new_size = s->state_stack_size * 3 / 2;
-    if (new_size < 8)
-      new_size = 8;
-    new_stack =
-        lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
-    if (!new_stack)
-      return -1;
-    s->state_stack_size = new_size;
-    s->state_stack = new_stack;
-  }
-  rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
-  s->state_stack_len++;
-  rs->type = type;
-  rs->count = count;
-  rs->stack_len = stack_len;
-  rs->cptr = cptr;
-  rs->pc = pc;
-  n = 2 * s->capture_count;
-  for (i = 0; i < n; i++)
-    rs->buf[i] = capture[i];
-  stack_buf = (StackInt *)(rs->buf + n);
-  for (i = 0; i < stack_len; i++)
-    stack_buf[i] = stack[i];
-  return 0;
+static int push_state(REExecContext *s,
+                      uint8_t **capture,
+                      StackInt *stack, size_t stack_len,
+                      const uint8_t *pc, const uint8_t *cptr,
+                      REExecStateEnum type, size_t count)
+{ /* save a backtracking state; returns 0 on success, -1 if lre_realloc fails */
+    REExecState *rs;
+    uint8_t *new_stack;
+    size_t new_size, i, n;
+    StackInt *stack_buf;
+
+    if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
+        /* reallocate the stack: grow by 1.5x, at least 8 states */
+        new_size = s->state_stack_size * 3 / 2;
+        if (new_size < 8)
+            new_size = 8;
+        new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
+        if (!new_stack)
+            return -1;
+        s->state_stack_size = new_size;
+        s->state_stack = new_stack;
+    }
+    rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size); /* slot for the new top state */
+    s->state_stack_len++;
+    rs->type = type;
+    rs->count = count;
+    rs->stack_len = stack_len; /* NOTE(review): narrowed to uint8_t; assumes the compiled stack size fits in a byte - confirm */
+    rs->cptr = cptr;
+    rs->pc = pc;
+    n = 2 * s->capture_count; /* a start and an end pointer per capture */
+    for(i = 0; i < n; i++)
+        rs->buf[i] = capture[i];
+    stack_buf = (StackInt *)(rs->buf + n); /* the stack copy is stored right after the captures */
+    for(i = 0; i < stack_len; i++)
+        stack_buf[i] = stack[i];
+    return 0;
 }
 
 /* return 1 if match, 0 if not match or -1 if error. */
 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                                    StackInt *stack, int stack_len,
                                    const uint8_t *pc, const uint8_t *cptr,
-                                   BOOL no_recurse) {
-  int opcode, ret;
-  int cbuf_type;
-  uint32_t val, c;
-  const uint8_t *cbuf_end;
-
-  cbuf_type = s->cbuf_type;
-  cbuf_end = s->cbuf_end;
-
-  for (;;) {
-    //        printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf +
-    //        RE_HEADER_LEN)));
-    opcode = *pc++;
-    switch (opcode) {
-    case REOP_match: {
-      REExecState *rs;
-      if (no_recurse)
-        return (intptr_t)cptr;
-      ret = 1;
-      goto recurse;
-    no_match:
-      if (no_recurse)
-        return 0;
-      ret = 0;
-    recurse:
-      for (;;) {
-        if (s->state_stack_len == 0)
-          return ret;
-        rs = (REExecState *)(s->state_stack +
-                             (s->state_stack_len - 1) * s->state_size);
-        if (rs->type == RE_EXEC_STATE_SPLIT) {
-          if (!ret) {
-          pop_state:
-            memcpy(capture, rs->buf, sizeof(capture[0]) * 2 * s->capture_count);
-          pop_state1:
-            pc = rs->pc;
-            cptr = rs->cptr;
-            stack_len = rs->stack_len;
-            memcpy(stack, rs->buf + 2 * s->capture_count,
-                   stack_len * sizeof(stack[0]));
-            s->state_stack_len--;
+                                   BOOL no_recurse)
+{
+    int opcode, ret;
+    int cbuf_type;
+    uint32_t val, c;
+    const uint8_t *cbuf_end;
+
+    cbuf_type = s->cbuf_type;
+    cbuf_end = s->cbuf_end;
+
+    for(;;) {
+        //        printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
+        opcode = *pc++;
+        switch(opcode) {
+        case REOP_match:
+            {
+                REExecState *rs;
+                if (no_recurse)
+                    return (intptr_t)cptr;
+                ret = 1;
+                goto recurse;
+            no_match:
+                if (no_recurse)
+                    return 0;
+                ret = 0;
+            recurse:
+                for(;;) {
+                    if (s->state_stack_len == 0)
+                        return ret;
+                    rs = (REExecState *)(s->state_stack +
+                                         (s->state_stack_len - 1) * s->state_size);
+                    if (rs->type == RE_EXEC_STATE_SPLIT) {
+                        if (!ret) {
+                        pop_state:
+                            memcpy(capture, rs->buf,
+                                   sizeof(capture[0]) * 2 * s->capture_count);
+                        pop_state1:
+                            pc = rs->pc;
+                            cptr = rs->cptr;
+                            stack_len = rs->stack_len;
+                            memcpy(stack, rs->buf + 2 * s->capture_count,
+                                   stack_len * sizeof(stack[0]));
+                            s->state_stack_len--;
+                            break;
+                        }
+                    } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
+                        if (!ret) {
+                            uint32_t char_count, i;
+                            memcpy(capture, rs->buf,
+                                   sizeof(capture[0]) * 2 * s->capture_count);
+                            stack_len = rs->stack_len;
+                            memcpy(stack, rs->buf + 2 * s->capture_count,
+                                   stack_len * sizeof(stack[0]));
+                            pc = rs->pc;
+                            cptr = rs->cptr;
+                            /* go backward */
+                            char_count = get_u32(pc + 12);
+                            for(i = 0; i < char_count; i++) {
+                                PREV_CHAR(cptr, s->cbuf, cbuf_type);
+                            }
+                            pc = (pc + 16) + (int)get_u32(pc);
+                            rs->cptr = cptr;
+                            rs->count--;
+                            if (rs->count == 0) {
+                                s->state_stack_len--;
+                            }
+                            break;
+                        }
+                    } else {
+                        ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
+                               (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
+                        if (ret) {
+                            /* keep the capture in case of positive lookahead */
+                            if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
+                                goto pop_state1;
+                            else
+                                goto pop_state;
+                        }
+                    }
+                    s->state_stack_len--;
+                }
+            }
+            break;
+        case REOP_char32:
+            val = get_u32(pc);
+            pc += 4;
+            goto test_char;
+        case REOP_char:
+            val = get_u16(pc);
+            pc += 2;
+        test_char:
+            if (cptr >= cbuf_end)
+                goto no_match;
+            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+            if (s->ignore_case) {
+                c = lre_canonicalize(c, s->is_unicode);
+            }
+            if (val != c)
+                goto no_match;
             break;
-          }
-        } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
-          if (!ret) {
-            uint32_t char_count, i;
-            memcpy(capture, rs->buf, sizeof(capture[0]) * 2 * s->capture_count);
-            stack_len = rs->stack_len;
-            memcpy(stack, rs->buf + 2 * s->capture_count,
-                   stack_len * sizeof(stack[0]));
-            pc = rs->pc;
-            cptr = rs->cptr;
-            /* go backward */
-            char_count = get_u32(pc + 12);
-            for (i = 0; i < char_count; i++) {
-              PREV_CHAR(cptr, s->cbuf);
+        case REOP_split_goto_first:
+        case REOP_split_next_first:
+            {
+                const uint8_t *pc1;
+
+                val = get_u32(pc);
+                pc += 4;
+                if (opcode == REOP_split_next_first) {
+                    pc1 = pc + (int)val;
+                } else {
+                    pc1 = pc;
+                    pc = pc + (int)val;
+                }
+                ret = push_state(s, capture, stack, stack_len,
+                                 pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
+                if (ret < 0)
+                    return -1;
+                break;
             }
-            pc = (pc + 16) + (int)get_u32(pc);
-            rs->cptr = cptr;
-            rs->count--;
-            if (rs->count == 0) {
-              s->state_stack_len--;
+        case REOP_lookahead:
+        case REOP_negative_lookahead:
+            val = get_u32(pc);
+            pc += 4;
+            ret = push_state(s, capture, stack, stack_len,
+                             pc + (int)val, cptr,
+                             RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
+                             0);
+            if (ret < 0)
+                return -1;
+            break;
+
+        case REOP_goto:
+            val = get_u32(pc);
+            pc += 4 + (int)val;
+            break;
+        case REOP_line_start:
+            if (cptr == s->cbuf)
+                break;
+            if (!s->multi_line)
+                goto no_match;
+            PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
+            if (!is_line_terminator(c))
+                goto no_match;
+            break;
+        case REOP_line_end:
+            if (cptr == cbuf_end)
+                break;
+            if (!s->multi_line)
+                goto no_match;
+            PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
+            if (!is_line_terminator(c))
+                goto no_match;
+            break;
+        case REOP_dot:
+            if (cptr == cbuf_end)
+                goto no_match;
+            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+            if (is_line_terminator(c))
+                goto no_match;
+            break;
+        case REOP_any:
+            if (cptr == cbuf_end)
+                goto no_match;
+            GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+            break;
+        case REOP_save_start:
+        case REOP_save_end:
+            val = *pc++;
+            assert(val < s->capture_count);
+            capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
+            break;
+        case REOP_save_reset:
+            {
+                uint32_t val2;
+                val = pc[0];
+                val2 = pc[1];
+                pc += 2;
+                assert(val2 < s->capture_count);
+                while (val <= val2) {
+                    capture[2 * val] = NULL;
+                    capture[2 * val + 1] = NULL;
+                    val++;
+                }
             }
             break;
-          }
-        } else {
-          ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
-                 (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
-          if (ret) {
-            /* keep the capture in case of positive lookahead */
-            if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
-              goto pop_state1;
-            else
-              goto pop_state;
-          }
-        }
-        s->state_stack_len--;
-      }
-    } break;
-    case REOP_char32:
-      val = get_u32(pc);
-      pc += 4;
-      goto test_char;
-    case REOP_char:
-      val = get_u16(pc);
-      pc += 2;
-    test_char:
-      if (cptr >= cbuf_end)
-        goto no_match;
-      GET_CHAR(c, cptr, cbuf_end);
-      if (s->ignore_case) {
-        c = lre_canonicalize(c, s->is_utf16);
-      }
-      if (val != c)
-        goto no_match;
-      break;
-    case REOP_split_goto_first:
-    case REOP_split_next_first: {
-      const uint8_t *pc1;
-
-      val = get_u32(pc);
-      pc += 4;
-      if (opcode == REOP_split_next_first) {
-        pc1 = pc + (int)val;
-      } else {
-        pc1 = pc;
-        pc = pc + (int)val;
-      }
-      ret = push_state(s, capture, stack, stack_len, pc1, cptr,
-                       RE_EXEC_STATE_SPLIT, 0);
-      if (ret < 0)
-        return -1;
-      break;
-    }
-    case REOP_lookahead:
-    case REOP_negative_lookahead:
-      val = get_u32(pc);
-      pc += 4;
-      ret = push_state(s, capture, stack, stack_len, pc + (int)val, cptr,
-                       RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, 0);
-      if (ret < 0)
-        return -1;
-      break;
-
-    case REOP_goto:
-      val = get_u32(pc);
-      pc += 4 + (int)val;
-      break;
-    case REOP_line_start:
-      if (cptr == s->cbuf)
-        break;
-      if (!s->multi_line)
-        goto no_match;
-      PEEK_PREV_CHAR(c, cptr, s->cbuf);
-      if (!is_line_terminator(c))
-        goto no_match;
-      break;
-    case REOP_line_end:
-      if (cptr == cbuf_end)
-        break;
-      if (!s->multi_line)
-        goto no_match;
-      PEEK_CHAR(c, cptr, cbuf_end);
-      if (!is_line_terminator(c))
-        goto no_match;
-      break;
-    case REOP_dot:
-      if (cptr == cbuf_end)
-        goto no_match;
-      GET_CHAR(c, cptr, cbuf_end);
-      if (is_line_terminator(c))
-        goto no_match;
-      break;
-    case REOP_any:
-      if (cptr == cbuf_end)
-        goto no_match;
-      GET_CHAR(c, cptr, cbuf_end);
-      break;
-    case REOP_save_start:
-    case REOP_save_end:
-      val = *pc++;
-      assert(val < s->capture_count);
-      capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
-      break;
-    case REOP_save_reset: {
-      uint32_t val2;
-      val = pc[0];
-      val2 = pc[1];
-      pc += 2;
-      assert(val2 < s->capture_count);
-      while (val <= val2) {
-        capture[2 * val] = NULL;
-        capture[2 * val + 1] = NULL;
-        val++;
-      }
-    } break;
-    case REOP_push_i32:
-      val = get_u32(pc);
-      pc += 4;
-      stack[stack_len++] = val;
-      break;
-    case REOP_drop:
-      stack_len--;
-      break;
-    case REOP_loop:
-      val = get_u32(pc);
-      pc += 4;
-      if (--stack[stack_len - 1] != 0) {
-        pc += (int)val;
-      }
-      break;
-    case REOP_push_char_pos:
-      stack[stack_len++] = (uintptr_t)cptr;
-      break;
-    case REOP_bne_char_pos:
-      val = get_u32(pc);
-      pc += 4;
-      if (stack[--stack_len] != (uintptr_t)cptr)
-        pc += (int)val;
-      break;
-    case REOP_word_boundary:
-    case REOP_not_word_boundary: {
-      BOOL v1, v2;
-      /* char before */
-      if (cptr == s->cbuf) {
-        v1 = FALSE;
-      } else {
-        PEEK_PREV_CHAR(c, cptr, s->cbuf);
-        v1 = is_word_char(c);
-      }
-      /* current char */
-      if (cptr >= cbuf_end) {
-        v2 = FALSE;
-      } else {
-        PEEK_CHAR(c, cptr, cbuf_end);
-        v2 = is_word_char(c);
-      }
-      if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
-        goto no_match;
-    } break;
-    case REOP_back_reference:
-    case REOP_backward_back_reference: {
-      const uint8_t *cptr1, *cptr1_end, *cptr1_start;
-      uint32_t c1, c2;
-
-      val = *pc++;
-      if (val >= s->capture_count)
-        goto no_match;
-      cptr1_start = capture[2 * val];
-      cptr1_end = capture[2 * val + 1];
-      if (!cptr1_start || !cptr1_end)
-        break;
-      if (opcode == REOP_back_reference) {
-        cptr1 = cptr1_start;
-        while (cptr1 < cptr1_end) {
-          if (cptr >= cbuf_end)
-            goto no_match;
-          GET_CHAR(c1, cptr1, cptr1_end);
-          GET_CHAR(c2, cptr, cbuf_end);
-          if (s->ignore_case) {
-            c1 = lre_canonicalize(c1, s->is_utf16);
-            c2 = lre_canonicalize(c2, s->is_utf16);
-          }
-          if (c1 != c2)
-            goto no_match;
-        }
-      } else {
-        cptr1 = cptr1_end;
-        while (cptr1 > cptr1_start) {
-          if (cptr == s->cbuf)
-            goto no_match;
-          GET_PREV_CHAR(c1, cptr1, cptr1_start);
-          GET_PREV_CHAR(c2, cptr, s->cbuf);
-          if (s->ignore_case) {
-            c1 = lre_canonicalize(c1, s->is_utf16);
-            c2 = lre_canonicalize(c2, s->is_utf16);
-          }
-          if (c1 != c2)
-            goto no_match;
+        case REOP_push_i32:
+            val = get_u32(pc);
+            pc += 4;
+            stack[stack_len++] = val;
+            break;
+        case REOP_drop:
+            stack_len--;
+            break;
+        case REOP_loop:
+            val = get_u32(pc);
+            pc += 4;
+            if (--stack[stack_len - 1] != 0) {
+                pc += (int)val;
+            }
+            break;
+        case REOP_push_char_pos:
+            stack[stack_len++] = (uintptr_t)cptr;
+            break;
+        case REOP_check_advance:
+            if (stack[--stack_len] == (uintptr_t)cptr)
+                goto no_match;
+            break;
+        case REOP_word_boundary:
+        case REOP_not_word_boundary:
+            {
+                BOOL v1, v2;
+                /* char before */
+                if (cptr == s->cbuf) {
+                    v1 = FALSE;
+                } else {
+                    PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
+                    v1 = is_word_char(c);
+                }
+                /* current char */
+                if (cptr >= cbuf_end) {
+                    v2 = FALSE;
+                } else {
+                    PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
+                    v2 = is_word_char(c);
+                }
+                if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
+                    goto no_match;
+            }
+            break;
+        case REOP_back_reference:
+        case REOP_backward_back_reference:
+            {
+                const uint8_t *cptr1, *cptr1_end, *cptr1_start;
+                uint32_t c1, c2;
+
+                val = *pc++;
+                if (val >= s->capture_count)
+                    goto no_match;
+                cptr1_start = capture[2 * val];
+                cptr1_end = capture[2 * val + 1];
+                if (!cptr1_start || !cptr1_end)
+                    break;
+                if (opcode == REOP_back_reference) {
+                    cptr1 = cptr1_start;
+                    while (cptr1 < cptr1_end) {
+                        if (cptr >= cbuf_end)
+                            goto no_match;
+                        GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
+                        GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
+                        if (s->ignore_case) {
+                            c1 = lre_canonicalize(c1, s->is_unicode);
+                            c2 = lre_canonicalize(c2, s->is_unicode);
+                        }
+                        if (c1 != c2)
+                            goto no_match;
+                    }
+                } else {
+                    cptr1 = cptr1_end;
+                    while (cptr1 > cptr1_start) {
+                        if (cptr == s->cbuf)
+                            goto no_match;
+                        GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
+                        GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
+                        if (s->ignore_case) {
+                            c1 = lre_canonicalize(c1, s->is_unicode);
+                            c2 = lre_canonicalize(c2, s->is_unicode);
+                        }
+                        if (c1 != c2)
+                            goto no_match;
+                    }
+                }
+            }
+            break;
+        case REOP_range:
+            {
+                int n;
+                uint32_t low, high, idx_min, idx_max, idx;
+
+                n = get_u16(pc); /* n must be >= 1 */
+                pc += 2;
+                if (cptr >= cbuf_end)
+                    goto no_match;
+                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+                if (s->ignore_case) {
+                    c = lre_canonicalize(c, s->is_unicode);
+                }
+                idx_min = 0;
+                low = get_u16(pc + 0 * 4);
+                if (c < low)
+                    goto no_match;
+                idx_max = n - 1;
+                high = get_u16(pc + idx_max * 4 + 2);
+                /* 0xffff in for last value means +infinity */
+                if (unlikely(c >= 0xffff) && high == 0xffff)
+                    goto range_match;
+                if (c > high)
+                    goto no_match;
+                while (idx_min <= idx_max) {
+                    idx = (idx_min + idx_max) / 2;
+                    low = get_u16(pc + idx * 4);
+                    high = get_u16(pc + idx * 4 + 2);
+                    if (c < low)
+                        idx_max = idx - 1;
+                    else if (c > high)
+                        idx_min = idx + 1;
+                    else
+                        goto range_match;
+                }
+                goto no_match;
+            range_match:
+                pc += 4 * n;
+            }
+            break;
+        case REOP_range32:
+            {
+                int n;
+                uint32_t low, high, idx_min, idx_max, idx;
+
+                n = get_u16(pc); /* n must be >= 1 */
+                pc += 2;
+                if (cptr >= cbuf_end)
+                    goto no_match;
+                GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+                if (s->ignore_case) {
+                    c = lre_canonicalize(c, s->is_unicode);
+                }
+                idx_min = 0;
+                low = get_u32(pc + 0 * 8);
+                if (c < low)
+                    goto no_match;
+                idx_max = n - 1;
+                high = get_u32(pc + idx_max * 8 + 4);
+                if (c > high)
+                    goto no_match;
+                while (idx_min <= idx_max) {
+                    idx = (idx_min + idx_max) / 2;
+                    low = get_u32(pc + idx * 8);
+                    high = get_u32(pc + idx * 8 + 4);
+                    if (c < low)
+                        idx_max = idx - 1;
+                    else if (c > high)
+                        idx_min = idx + 1;
+                    else
+                        goto range32_match;
+                }
+                goto no_match;
+            range32_match:
+                pc += 8 * n;
+            }
+            break;
+        case REOP_prev:
+            /* go to the previous char */
+            if (cptr == s->cbuf)
+                goto no_match;
+            PREV_CHAR(cptr, s->cbuf, cbuf_type);
+            break;
+        case REOP_simple_greedy_quant:
+            {
+                uint32_t next_pos, quant_min, quant_max;
+                size_t q;
+                intptr_t res;
+                const uint8_t *pc1;
+
+                next_pos = get_u32(pc);
+                quant_min = get_u32(pc + 4);
+                quant_max = get_u32(pc + 8);
+                pc += 16;
+                pc1 = pc;
+                pc += (int)next_pos;
+
+                q = 0;
+                for(;;) {
+                    res = lre_exec_backtrack(s, capture, stack, stack_len,
+                                             pc1, cptr, TRUE);
+                    if (res == -1)
+                        return res;
+                    if (!res)
+                        break;
+                    cptr = (uint8_t *)res;
+                    q++;
+                    if (q >= quant_max && quant_max != INT32_MAX)
+                        break;
+                }
+                if (q < quant_min)
+                    goto no_match;
+                if (q > quant_min) {
+                    /* will examine all matches down to quant_min */
+                    ret = push_state(s, capture, stack, stack_len,
+                                     pc1 - 16, cptr,
+                                     RE_EXEC_STATE_GREEDY_QUANT,
+                                     q - quant_min);
+                    if (ret < 0)
+                        return -1;
+                }
+            }
+            break;
+        default:
+            abort();
         }
-      }
-    } break;
-    case REOP_range: {
-      int n;
-      uint32_t low, high, idx_min, idx_max, idx;
-
-      n = get_u16(pc); /* n must be >= 1 */
-      pc += 2;
-      if (cptr >= cbuf_end)
-        goto no_match;
-      GET_CHAR(c, cptr, cbuf_end);
-      if (s->ignore_case) {
-        c = lre_canonicalize(c, s->is_utf16);
-      }
-      idx_min = 0;
-      low = get_u16(pc + 0 * 4);
-      if (c < low)
-        goto no_match;
-      idx_max = n - 1;
-      high = get_u16(pc + idx_max * 4 + 2);
-      /* 0xffff in for last value means +infinity */
-      if (unlikely(c >= 0xffff) && high == 0xffff)
-        goto range_match;
-      if (c > high)
-        goto no_match;
-      while (idx_min <= idx_max) {
-        idx = (idx_min + idx_max) / 2;
-        low = get_u16(pc + idx * 4);
-        high = get_u16(pc + idx * 4 + 2);
-        if (c < low)
-          idx_max = idx - 1;
-        else if (c > high)
-          idx_min = idx + 1;
-        else
-          goto range_match;
-      }
-      goto no_match;
-    range_match:
-      pc += 4 * n;
-    } break;
-    case REOP_range32: {
-      int n;
-      uint32_t low, high, idx_min, idx_max, idx;
-
-      n = get_u16(pc); /* n must be >= 1 */
-      pc += 2;
-      if (cptr >= cbuf_end)
-        goto no_match;
-      GET_CHAR(c, cptr, cbuf_end);
-      if (s->ignore_case) {
-        c = lre_canonicalize(c, s->is_utf16);
-      }
-      idx_min = 0;
-      low = get_u32(pc + 0 * 8);
-      if (c < low)
-        goto no_match;
-      idx_max = n - 1;
-      high = get_u32(pc + idx_max * 8 + 4);
-      if (c > high)
-        goto no_match;
-      while (idx_min <= idx_max) {
-        idx = (idx_min + idx_max) / 2;
-        low = get_u32(pc + idx * 8);
-        high = get_u32(pc + idx * 8 + 4);
-        if (c < low)
-          idx_max = idx - 1;
-        else if (c > high)
-          idx_min = idx + 1;
-        else
-          goto range32_match;
-      }
-      goto no_match;
-    range32_match:
-      pc += 8 * n;
-    } break;
-    case REOP_prev:
-      /* go to the previous char */
-      if (cptr == s->cbuf)
-        goto no_match;
-      PREV_CHAR(cptr, s->cbuf);
-      break;
-    case REOP_simple_greedy_quant: {
-      uint32_t next_pos, quant_min, quant_max;
-      size_t q;
-      intptr_t res;
-      const uint8_t *pc1;
-
-      next_pos = get_u32(pc);
-      quant_min = get_u32(pc + 4);
-      quant_max = get_u32(pc + 8);
-      pc += 16;
-      pc1 = pc;
-      pc += (int)next_pos;
-
-      q = 0;
-      for (;;) {
-        res = lre_exec_backtrack(s, capture, stack, stack_len, pc1, cptr, TRUE);
-        if (res == -1)
-          return res;
-        if (!res)
-          break;
-        cptr = (uint8_t *)res;
-        q++;
-        if (q >= quant_max && quant_max != INT32_MAX)
-          break;
-      }
-      if (q < quant_min)
-        goto no_match;
-      if (q > quant_min) {
-        /* will examine all matches down to quant_min */
-        ret = push_state(s, capture, stack, stack_len, pc1 - 16, cptr,
-                         RE_EXEC_STATE_GREEDY_QUANT, q - quant_min);
-        if (ret < 0)
-          return -1;
-      }
-    } break;
-    default:
-      abort();
     }
-  }
 }
 
 /* Return 1 if match, 0 if not match or -1 if error. cindex is the
    starting position of the match and must be such as 0 <= cindex <=
    clen. */
-int lre_exec(uint8_t **capture, const uint8_t *bc_buf, const uint8_t *cbuf,
-             int cindex, int clen, int cbuf_type, void *opaque) {
-  REExecContext s_s, *s = &s_s;
-  int re_flags, i, alloca_size, ret;
-  StackInt *stack_buf;
-
-  re_flags = bc_buf[RE_HEADER_FLAGS];
-  s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
-  s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
-  s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
-  s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
-  s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
-  s->cbuf = cbuf;
-  s->cbuf_end = cbuf + (clen << cbuf_type);
-  s->cbuf_type = cbuf_type;
-  if (s->cbuf_type == 1 && s->is_utf16)
-    s->cbuf_type = 2;
-  s->opaque = opaque;
-
-  s->state_size = sizeof(REExecState) +
-                  s->capture_count * sizeof(capture[0]) * 2 +
-                  s->stack_size_max * sizeof(stack_buf[0]);
-  s->state_stack = NULL;
-  s->state_stack_len = 0;
-  s->state_stack_size = 0;
-
-  for (i = 0; i < s->capture_count * 2; i++)
-    capture[i] = NULL;
-  alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
-  stack_buf = alloca(alloca_size);
-  ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
-                           cbuf + (cindex << cbuf_type), FALSE);
-  lre_realloc(s->opaque, s->state_stack, 0);
-  return ret;
+int lre_exec(uint8_t **capture,
+             const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
+             int cbuf_type, void *opaque)
+{
+    REExecContext s_s, *s = &s_s;
+    int re_flags, i, alloca_size, ret;
+    StackInt *stack_buf;
+
+    re_flags = lre_get_flags(bc_buf);
+    s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
+    s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
+    s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
+    s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
+    s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
+    s->cbuf = cbuf;
+    s->cbuf_end = cbuf + (clen << cbuf_type);
+    s->cbuf_type = cbuf_type;
+    if (s->cbuf_type == 1 && s->is_unicode)
+        s->cbuf_type = 2;
+    s->opaque = opaque;
+
+    s->state_size = sizeof(REExecState) +
+        s->capture_count * sizeof(capture[0]) * 2 +
+        s->stack_size_max * sizeof(stack_buf[0]);
+    s->state_stack = NULL;
+    s->state_stack_len = 0;
+    s->state_stack_size = 0;
+
+    for(i = 0; i < s->capture_count * 2; i++)
+        capture[i] = NULL;
+    alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
+    stack_buf = alloca(alloca_size);
+    ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
+                             cbuf + (cindex << cbuf_type), FALSE);
+    lre_realloc(s->opaque, s->state_stack, 0);
+    return ret;
 }
 
-int lre_get_capture_count(const uint8_t *bc_buf) {
-  return bc_buf[RE_HEADER_CAPTURE_COUNT];
+int lre_get_capture_count(const uint8_t *bc_buf)
+{
+    return bc_buf[RE_HEADER_CAPTURE_COUNT];
 }
 
-int lre_get_flags(const uint8_t *bc_buf) { return bc_buf[RE_HEADER_FLAGS]; }
+int lre_get_flags(const uint8_t *bc_buf)
+{
+    return bc_buf[RE_HEADER_FLAGS];
+}
 
 /* Return NULL if no group names. Otherwise, return a pointer to
    'capture_count - 1' zero terminated UTF-8 strings. */
-const char *lre_get_groupnames(const uint8_t *bc_buf) {
-  uint32_t re_bytecode_len;
-  if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
-    return NULL;
-  re_bytecode_len = get_u32(bc_buf + 3);
-  return (const char *)(bc_buf + 7 + re_bytecode_len);
+const char *lre_get_groupnames(const uint8_t *bc_buf)
+{
+    uint32_t re_bytecode_len;
+    if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
+        return NULL;
+    re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
+    return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
 }
 
-BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) {
-  return FALSE;
+#ifdef TEST
+
+BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
+{
+    return FALSE;
 }
 
-void *lre_realloc(void *opaque, void *ptr, size_t size) {
-  return realloc(ptr, size);
+void *lre_realloc(void *opaque, void *ptr, size_t size)
+{
+    return realloc(ptr, size);
 }
 
-#ifdef TEST
+int main(int argc, char **argv)
+{
+    int len, flags, ret, i;
+    uint8_t *bc;
+    char error_msg[64];
+    uint8_t *capture[CAPTURE_COUNT_MAX * 2];
+    const char *input;
+    int input_len, capture_count;
+
+    if (argc < 4) {
+        printf("usage: %s regexp flags input\n", argv[0]);
+        return 1;
+    }
+    flags = atoi(argv[2]);
+    bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
+                     strlen(argv[1]), flags, NULL);
+    if (!bc) {
+        fprintf(stderr, "error: %s\n", error_msg);
+        exit(1);
+    }
 
-int main(int argc, char **argv) {
-  int len, ret, i;
-  uint8_t *bc;
-  char error_msg[64];
-  uint8_t *capture[CAPTURE_COUNT_MAX * 2];
-  const char *input;
-  int input_len, capture_count;
-
-  if (argc < 3) {
-    printf("usage: %s regexp input\n", argv[0]);
-    exit(1);
-  }
-  bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], strlen(argv[1]),
-                   0, NULL);
-  if (!bc) {
-    fprintf(stderr, "error: %s\n", error_msg);
-    exit(1);
-  }
-
-  input = argv[2];
-  input_len = strlen(input);
-
-  ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
-  printf("ret=%d\n", ret);
-  if (ret == 1) {
-    capture_count = lre_get_capture_count(bc);
-    for (i = 0; i < 2 * capture_count; i++) {
-      uint8_t *ptr;
-      ptr = capture[i];
-      printf("%d: ", i);
-      if (!ptr)
-        printf("<nil>");
-      else
-        printf("%u", (int)(ptr - (uint8_t *)input));
-      printf("\n");
+    input = argv[3];
+    input_len = strlen(input);
+
+    ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
+    printf("ret=%d\n", ret);
+    if (ret == 1) {
+        capture_count = lre_get_capture_count(bc);
+        for(i = 0; i < 2 * capture_count; i++) {
+            uint8_t *ptr;
+            ptr = capture[i];
+            printf("%d: ", i);
+            if (!ptr)
+                printf("<nil>");
+            else
+                printf("%u", (int)(ptr - (uint8_t *)input));
+            printf("\n");
+        }
     }
-  }
-  return 0;
+    return 0;
 }
 #endif
diff --git a/libregexp/libregexp.h b/libregexp/libregexp.h
index 9aedb7e..7af7ece 100644
--- a/libregexp/libregexp.h
+++ b/libregexp/libregexp.h
@@ -1,6 +1,6 @@
 /*
  * Regular Expression Engine
- * 
+ *
  * Copyright (c) 2017-2018 Fabrice Bellard
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -25,18 +25,15 @@
 #define LIBREGEXP_H
 
 #include <stddef.h>
-
-#include "libunicode.h"
-
-#define LRE_BOOL  int       /* for documentation purposes */
+#include <stdint.h>
 
 #define LRE_FLAG_GLOBAL     (1 << 0)
 #define LRE_FLAG_IGNORECASE (1 << 1)
 #define LRE_FLAG_MULTILINE  (1 << 2)
 #define LRE_FLAG_DOTALL     (1 << 3)
-#define LRE_FLAG_UTF16      (1 << 4)
+#define LRE_FLAG_UNICODE    (1 << 4)
 #define LRE_FLAG_STICKY     (1 << 5)
-
+#define LRE_FLAG_INDICES    (1 << 6) /* Unused by libregexp, just recorded. */
 #define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */
 
 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
@@ -50,43 +47,9 @@ int lre_exec(uint8_t **capture,
              int cbuf_type, void *opaque);
 
 int lre_parse_escape(const uint8_t **pp, int allow_utf16);
-LRE_BOOL lre_is_space(int c);
 
-/* must be provided by the user */
-LRE_BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size); 
+/* must be provided by the user, return non zero if overflow */
+int lre_check_stack_overflow(void *opaque, size_t alloca_size);
 void *lre_realloc(void *opaque, void *ptr, size_t size);
 
-/* JS identifier test */
-extern uint32_t const lre_id_start_table_ascii[4];
-extern uint32_t const lre_id_continue_table_ascii[4];
-
-static inline int lre_js_is_ident_first(int c)
-{
-    if ((uint32_t)c < 128) {
-        return (lre_id_start_table_ascii[c >> 5] >> (c & 31)) & 1;
-    } else {
-#ifdef CONFIG_ALL_UNICODE
-        return lre_is_id_start(c);
-#else
-        return !lre_is_space(c);
-#endif
-    }
-}
-
-static inline int lre_js_is_ident_next(int c)
-{
-    if ((uint32_t)c < 128) {
-        return (lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1;
-    } else {
-        /* ZWNJ and ZWJ are accepted in identifiers */
-#ifdef CONFIG_ALL_UNICODE
-        return lre_is_id_continue(c) || c == 0x200C || c == 0x200D;
-#else
-        return !lre_is_space(c) || c == 0x200C || c == 0x200D;
-#endif
-    }
-}
-
-#undef LRE_BOOL
-
 #endif /* LIBREGEXP_H */
diff --git a/libregexp/libunicode-table.h b/libregexp/libunicode-table.h
index b64178b..72d495e 100644
--- a/libregexp/libunicode-table.h
+++ b/libregexp/libunicode-table.h
@@ -189,9 +189,13 @@ static const uint8_t unicode_prop_Cased1_table[196] = {
 };
 
 static const uint8_t unicode_prop_Cased1_index[21] = {
-    0xb9, 0x02, 0xe0, 0xc0, 0x1d, 0x20, 0xe5, 0x2c,
-    0x20, 0xb1, 0x07, 0x21, 0xc1, 0xd6, 0x21, 0x4a,
-    0xf1, 0x01, 0x8a, 0xf1, 0x01,
+    0xb9, 0x02, 0xe0,  //  002B9 at 39
+    0xc0, 0x1d, 0x20,  //  01DC0 at 65
+    0xe5, 0x2c, 0x20,  //  02CE5 at 97
+    0xb1, 0x07, 0x21,  //  107B1 at 129
+    0xc1, 0xd6, 0x21,  //  1D6C1 at 161
+    0x4a, 0xf1, 0x01,  //  1F14A at 192
+    0x8a, 0xf1, 0x01,  //  1F18A at 224 (upper bound)
 };
 
 static const uint8_t unicode_prop_Case_Ignorable_table[737] = {
@@ -291,15 +295,29 @@ static const uint8_t unicode_prop_Case_Ignorable_table[737] = {
 };
 
 static const uint8_t unicode_prop_Case_Ignorable_index[69] = {
-    0xbe, 0x05, 0x00, 0xfe, 0x07, 0x00, 0x52, 0x0a,
-    0xa0, 0xc1, 0x0b, 0x00, 0x82, 0x0d, 0x00, 0x3f,
-    0x10, 0x80, 0xd4, 0x17, 0x40, 0xcf, 0x1a, 0x20,
-    0xf5, 0x1c, 0x00, 0x80, 0x20, 0x00, 0x16, 0xa0,
-    0x00, 0xc6, 0xa8, 0x00, 0xc2, 0xaa, 0x60, 0x56,
-    0xfe, 0x20, 0xb1, 0x07, 0x01, 0x75, 0x10, 0x01,
-    0xeb, 0x12, 0x21, 0x41, 0x16, 0x01, 0x5c, 0x1a,
-    0x01, 0x43, 0x1f, 0x01, 0x2e, 0xcf, 0x41, 0x25,
-    0xe0, 0x01, 0xf0, 0x01, 0x0e,
+    0xbe, 0x05, 0x00,  //  005BE at 32
+    0xfe, 0x07, 0x00,  //  007FE at 64
+    0x52, 0x0a, 0xa0,  //  00A52 at 101
+    0xc1, 0x0b, 0x00,  //  00BC1 at 128
+    0x82, 0x0d, 0x00,  //  00D82 at 160
+    0x3f, 0x10, 0x80,  //  0103F at 196
+    0xd4, 0x17, 0x40,  //  017D4 at 226
+    0xcf, 0x1a, 0x20,  //  01ACF at 257
+    0xf5, 0x1c, 0x00,  //  01CF5 at 288
+    0x80, 0x20, 0x00,  //  02080 at 320
+    0x16, 0xa0, 0x00,  //  0A016 at 352
+    0xc6, 0xa8, 0x00,  //  0A8C6 at 384
+    0xc2, 0xaa, 0x60,  //  0AAC2 at 419
+    0x56, 0xfe, 0x20,  //  0FE56 at 449
+    0xb1, 0x07, 0x01,  //  107B1 at 480
+    0x75, 0x10, 0x01,  //  11075 at 512
+    0xeb, 0x12, 0x21,  //  112EB at 545
+    0x41, 0x16, 0x01,  //  11641 at 576
+    0x5c, 0x1a, 0x01,  //  11A5C at 608
+    0x43, 0x1f, 0x01,  //  11F43 at 640
+    0x2e, 0xcf, 0x41,  //  1CF2E at 674
+    0x25, 0xe0, 0x01,  //  1E025 at 704
+    0xf0, 0x01, 0x0e,  //  E01F0 at 736 (upper bound)
 };
 
 static const uint8_t unicode_prop_ID_Start_table[1100] = {
@@ -444,20 +462,41 @@ static const uint8_t unicode_prop_ID_Start_table[1100] = {
 };
 
 static const uint8_t unicode_prop_ID_Start_index[105] = {
-    0xf6, 0x03, 0x20, 0xa6, 0x07, 0x00, 0xa9, 0x09,
-    0x20, 0xb1, 0x0a, 0x00, 0xba, 0x0b, 0x20, 0x3b,
-    0x0d, 0x20, 0xc7, 0x0e, 0x20, 0x49, 0x12, 0x00,
-    0x9b, 0x16, 0x00, 0xac, 0x19, 0x00, 0xc0, 0x1d,
-    0x80, 0x80, 0x20, 0x20, 0x70, 0x2d, 0x00, 0x00,
-    0x32, 0x00, 0xda, 0xa7, 0x00, 0x4c, 0xaa, 0x20,
-    0xc7, 0xd7, 0x20, 0xfc, 0xfd, 0x20, 0x9d, 0x02,
-    0x21, 0x96, 0x05, 0x01, 0xf3, 0x08, 0x01, 0xb3,
-    0x0c, 0x21, 0x73, 0x11, 0x61, 0x34, 0x13, 0x01,
-    0x1b, 0x17, 0x21, 0x8a, 0x1a, 0x01, 0x34, 0x1f,
-    0x21, 0xbf, 0x6a, 0x01, 0x23, 0xb1, 0xa1, 0xad,
-    0xd4, 0x01, 0x6f, 0xd7, 0x01, 0xff, 0xe7, 0x61,
-    0x5e, 0xee, 0x01, 0xe1, 0xeb, 0x22, 0xb0, 0x23,
-    0x03,
+    0xf6, 0x03, 0x20,  //  003F6 at 33
+    0xa6, 0x07, 0x00,  //  007A6 at 64
+    0xa9, 0x09, 0x20,  //  009A9 at 97
+    0xb1, 0x0a, 0x00,  //  00AB1 at 128
+    0xba, 0x0b, 0x20,  //  00BBA at 161
+    0x3b, 0x0d, 0x20,  //  00D3B at 193
+    0xc7, 0x0e, 0x20,  //  00EC7 at 225
+    0x49, 0x12, 0x00,  //  01249 at 256
+    0x9b, 0x16, 0x00,  //  0169B at 288
+    0xac, 0x19, 0x00,  //  019AC at 320
+    0xc0, 0x1d, 0x80,  //  01DC0 at 356
+    0x80, 0x20, 0x20,  //  02080 at 385
+    0x70, 0x2d, 0x00,  //  02D70 at 416
+    0x00, 0x32, 0x00,  //  03200 at 448
+    0xda, 0xa7, 0x00,  //  0A7DA at 480
+    0x4c, 0xaa, 0x20,  //  0AA4C at 513
+    0xc7, 0xd7, 0x20,  //  0D7C7 at 545
+    0xfc, 0xfd, 0x20,  //  0FDFC at 577
+    0x9d, 0x02, 0x21,  //  1029D at 609
+    0x96, 0x05, 0x01,  //  10596 at 640
+    0xf3, 0x08, 0x01,  //  108F3 at 672
+    0xb3, 0x0c, 0x21,  //  10CB3 at 705
+    0x73, 0x11, 0x61,  //  11173 at 739
+    0x34, 0x13, 0x01,  //  11334 at 768
+    0x1b, 0x17, 0x21,  //  1171B at 801
+    0x8a, 0x1a, 0x01,  //  11A8A at 832
+    0x34, 0x1f, 0x21,  //  11F34 at 865
+    0xbf, 0x6a, 0x01,  //  16ABF at 896
+    0x23, 0xb1, 0xa1,  //  1B123 at 933
+    0xad, 0xd4, 0x01,  //  1D4AD at 960
+    0x6f, 0xd7, 0x01,  //  1D76F at 992
+    0xff, 0xe7, 0x61,  //  1E7FF at 1027
+    0x5e, 0xee, 0x01,  //  1EE5E at 1056
+    0xe1, 0xeb, 0x22,  //  2EBE1 at 1089
+    0xb0, 0x23, 0x03,  //  323B0 at 1120 (upper bound)
 };
 
 static const uint8_t unicode_prop_ID_Continue1_table[660] = {
@@ -547,14 +586,27 @@ static const uint8_t unicode_prop_ID_Continue1_table[660] = {
 };
 
 static const uint8_t unicode_prop_ID_Continue1_index[63] = {
-    0xfa, 0x06, 0x00, 0x70, 0x09, 0x00, 0xf0, 0x0a,
-    0x40, 0x57, 0x0c, 0x00, 0xf0, 0x0d, 0x60, 0xc7,
-    0x0f, 0x20, 0xea, 0x17, 0x40, 0x05, 0x1b, 0x00,
-    0x41, 0x20, 0x00, 0x0c, 0xa8, 0x80, 0x37, 0xaa,
-    0x20, 0x50, 0xfe, 0x20, 0x3a, 0x0d, 0x21, 0x74,
-    0x11, 0x01, 0x5a, 0x14, 0x21, 0x44, 0x19, 0x81,
-    0x5a, 0x1d, 0xa1, 0xf5, 0x6a, 0x21, 0x45, 0xd2,
-    0x41, 0xaf, 0xe2, 0x21, 0xf0, 0x01, 0x0e,
+    0xfa, 0x06, 0x00,  //  006FA at 32
+    0x70, 0x09, 0x00,  //  00970 at 64
+    0xf0, 0x0a, 0x40,  //  00AF0 at 98
+    0x57, 0x0c, 0x00,  //  00C57 at 128
+    0xf0, 0x0d, 0x60,  //  00DF0 at 163
+    0xc7, 0x0f, 0x20,  //  00FC7 at 193
+    0xea, 0x17, 0x40,  //  017EA at 226
+    0x05, 0x1b, 0x00,  //  01B05 at 256
+    0x41, 0x20, 0x00,  //  02041 at 288
+    0x0c, 0xa8, 0x80,  //  0A80C at 324
+    0x37, 0xaa, 0x20,  //  0AA37 at 353
+    0x50, 0xfe, 0x20,  //  0FE50 at 385
+    0x3a, 0x0d, 0x21,  //  10D3A at 417
+    0x74, 0x11, 0x01,  //  11174 at 448
+    0x5a, 0x14, 0x21,  //  1145A at 481
+    0x44, 0x19, 0x81,  //  11944 at 516
+    0x5a, 0x1d, 0xa1,  //  11D5A at 549
+    0xf5, 0x6a, 0x21,  //  16AF5 at 577
+    0x45, 0xd2, 0x41,  //  1D245 at 610
+    0xaf, 0xe2, 0x21,  //  1E2AF at 641
+    0xf0, 0x01, 0x0e,  //  E01F0 at 672 (upper bound)
 };
 
 #ifdef CONFIG_ALL_UNICODE
@@ -676,17 +728,35 @@ static const uint8_t unicode_cc_table[899] = {
 };
 
 static const uint8_t unicode_cc_index[87] = {
-    0x4d, 0x03, 0x00, 0x97, 0x05, 0x20, 0xc6, 0x05,
-    0x00, 0xe7, 0x06, 0x00, 0x45, 0x07, 0x00, 0x9c,
-    0x08, 0x00, 0x4d, 0x09, 0x00, 0x3c, 0x0b, 0x00,
-    0x3d, 0x0d, 0x00, 0x36, 0x0f, 0x00, 0x38, 0x10,
-    0x20, 0x3a, 0x19, 0x00, 0xcb, 0x1a, 0x20, 0xd3,
-    0x1c, 0x00, 0xcf, 0x1d, 0x00, 0xe2, 0x20, 0x00,
-    0x2e, 0x30, 0x20, 0x2b, 0xa9, 0x20, 0xed, 0xab,
-    0x00, 0x39, 0x0a, 0x01, 0x51, 0x0f, 0x01, 0x73,
-    0x11, 0x01, 0x75, 0x13, 0x01, 0x2b, 0x17, 0x21,
-    0x3f, 0x1c, 0x21, 0x9e, 0xbc, 0x21, 0x08, 0xe0,
-    0x01, 0x44, 0xe9, 0x01, 0x4b, 0xe9, 0x01,
+    0x4d, 0x03, 0x00,  //  0034D at 32
+    0x97, 0x05, 0x20,  //  00597 at 65
+    0xc6, 0x05, 0x00,  //  005C6 at 96
+    0xe7, 0x06, 0x00,  //  006E7 at 128
+    0x45, 0x07, 0x00,  //  00745 at 160
+    0x9c, 0x08, 0x00,  //  0089C at 192
+    0x4d, 0x09, 0x00,  //  0094D at 224
+    0x3c, 0x0b, 0x00,  //  00B3C at 256
+    0x3d, 0x0d, 0x00,  //  00D3D at 288
+    0x36, 0x0f, 0x00,  //  00F36 at 320
+    0x38, 0x10, 0x20,  //  01038 at 353
+    0x3a, 0x19, 0x00,  //  0193A at 384
+    0xcb, 0x1a, 0x20,  //  01ACB at 417
+    0xd3, 0x1c, 0x00,  //  01CD3 at 448
+    0xcf, 0x1d, 0x00,  //  01DCF at 480
+    0xe2, 0x20, 0x00,  //  020E2 at 512
+    0x2e, 0x30, 0x20,  //  0302E at 545
+    0x2b, 0xa9, 0x20,  //  0A92B at 577
+    0xed, 0xab, 0x00,  //  0ABED at 608
+    0x39, 0x0a, 0x01,  //  10A39 at 640
+    0x51, 0x0f, 0x01,  //  10F51 at 672
+    0x73, 0x11, 0x01,  //  11173 at 704
+    0x75, 0x13, 0x01,  //  11375 at 736
+    0x2b, 0x17, 0x21,  //  1172B at 769
+    0x3f, 0x1c, 0x21,  //  11C3F at 801
+    0x9e, 0xbc, 0x21,  //  1BC9E at 833
+    0x08, 0xe0, 0x01,  //  1E008 at 864
+    0x44, 0xe9, 0x01,  //  1E944 at 896
+    0x4b, 0xe9, 0x01,  //  1E94B at 928 (upper bound)
 };
 
 static const uint32_t unicode_decomp_table1[699] = {
@@ -3779,72 +3849,70 @@ static const uint8_t unicode_prop_Changes_When_Titlecased1_table[22] = {
     0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80,
 };
 
-static const uint8_t unicode_prop_Changes_When_Casefolded1_table[33] = {
-    0x40, 0xde, 0x80, 0xcf, 0x80, 0x97, 0x80, 0x44,
-    0x3c, 0x80, 0x59, 0x11, 0x80, 0x40, 0xe4, 0x3f,
-    0x3f, 0x87, 0x89, 0x11, 0x05, 0x02, 0x11, 0x80,
-    0xa9, 0x11, 0x80, 0x60, 0xdb, 0x07, 0x86, 0x8b,
-    0x84,
+static const uint8_t unicode_prop_Changes_When_Casefolded1_table[29] = {
+    0x41, 0xef, 0x80, 0x41, 0x9e, 0x80, 0x9e, 0x80,
+    0x5a, 0xe4, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00,
+    0x80, 0xde, 0x06, 0x06, 0x80, 0x8a, 0x09, 0x81,
+    0x89, 0x10, 0x81, 0x8d, 0x80,
 };
 
-static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[451] = {
+static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[447] = {
     0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12,
-    0x10, 0x82, 0x9f, 0x80, 0xcf, 0x01, 0x80, 0x8b,
-    0x07, 0x80, 0xfb, 0x01, 0x01, 0x80, 0xa5, 0x80,
-    0x40, 0xbb, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08,
-    0x81, 0x89, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08,
-    0x80, 0xc9, 0x82, 0x9c, 0x80, 0x41, 0x93, 0x80,
-    0x40, 0x93, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87,
-    0xfb, 0x08, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11,
-    0x80, 0x40, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe,
-    0x80, 0xa7, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88,
-    0x03, 0x03, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00,
-    0x26, 0x80, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03,
-    0x80, 0x8b, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81,
-    0x46, 0x52, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10,
-    0x8a, 0x80, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1,
-    0xa4, 0x40, 0xd9, 0x80, 0x40, 0xd5, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x01, 0x3f, 0x3f, 0x87,
-    0x89, 0x11, 0x04, 0x00, 0x29, 0x04, 0x12, 0x80,
-    0x88, 0x12, 0x80, 0x88, 0x11, 0x11, 0x04, 0x08,
-    0x8f, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b,
-    0x00, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a,
-    0x80, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a,
-    0x01, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06,
-    0x05, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80,
-    0x40, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41,
-    0x34, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6,
-    0x82, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0,
-    0x80, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40,
-    0xd5, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09,
-    0x80, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf,
-    0x9e, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f,
-    0x60, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40,
-    0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80,
-    0x60, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81,
-    0x89, 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9,
-    0xa5, 0x86, 0x8b, 0x24, 0x00, 0x97, 0x04, 0x00,
-    0x01, 0x01, 0x80, 0xeb, 0xa0, 0x41, 0x6a, 0x91,
-    0xbf, 0x81, 0xb5, 0xa7, 0x8c, 0x82, 0x99, 0x95,
-    0x94, 0x81, 0x8b, 0x80, 0x92, 0x03, 0x1a, 0x00,
-    0x80, 0x40, 0x86, 0x08, 0x80, 0x9f, 0x99, 0x40,
-    0x83, 0x15, 0x0d, 0x0d, 0x0a, 0x16, 0x06, 0x80,
-    0x88, 0x47, 0x87, 0x20, 0xa9, 0x80, 0x88, 0x60,
-    0xb4, 0xe4, 0x83, 0x54, 0xb9, 0x86, 0x8d, 0x87,
-    0xbf, 0x85, 0x42, 0x3e, 0xd4, 0x80, 0xc6, 0x01,
-    0x08, 0x09, 0x0b, 0x80, 0x8b, 0x00, 0x06, 0x80,
-    0xc0, 0x03, 0x0f, 0x06, 0x80, 0x9b, 0x03, 0x04,
-    0x00, 0x16, 0x80, 0x41, 0x53, 0x81, 0x41, 0x23,
-    0x81, 0xb1, 0x48, 0x2f, 0xbd, 0x4d, 0x91, 0x18,
-    0x9a, 0x01, 0x00, 0x08, 0x80, 0x89, 0x03, 0x00,
-    0x00, 0x28, 0x18, 0x00, 0x00, 0x02, 0x01, 0x00,
-    0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x0b,
-    0x06, 0x03, 0x03, 0x00, 0x80, 0x89, 0x80, 0x90,
-    0x22, 0x04, 0x80, 0x90, 0x42, 0x43, 0x8a, 0x84,
-    0x9e, 0x80, 0x9f, 0x99, 0x82, 0xa2, 0x80, 0xee,
-    0x82, 0x8c, 0xab, 0x83, 0x88, 0x31, 0x49, 0x9d,
-    0x89, 0x60, 0xfc, 0x05, 0x42, 0x1d, 0x6b, 0x05,
-    0xe1, 0x4f, 0xff,
+    0x10, 0x82, 0xf3, 0x80, 0x8b, 0x80, 0x40, 0x84,
+    0x01, 0x01, 0x80, 0xa2, 0x01, 0x80, 0x40, 0xbb,
+    0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x81, 0x89,
+    0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x07, 0x80,
+    0x9e, 0x80, 0xa0, 0x82, 0x9c, 0x80, 0x42, 0x28,
+    0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0xfb, 0x08,
+    0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0x40,
+    0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0x80, 0xa7,
+    0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x03, 0x03,
+    0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x26, 0x80,
+    0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x80, 0x8b,
+    0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x46, 0x52,
+    0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x8a, 0x80,
+    0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0xa4, 0x40,
+    0xd5, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, 0x80,
+    0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+    0xb7, 0x05, 0x00, 0x13, 0x05, 0x11, 0x02, 0x0c,
+    0x11, 0x00, 0x00, 0x0c, 0x15, 0x05, 0x08, 0x8f,
+    0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00,
+    0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x80,
+    0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x01,
+    0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x05,
+    0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x40,
+    0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0x34,
+    0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0x82,
+    0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0x80,
+    0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0xd5,
+    0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0x80,
+    0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0x9e,
+    0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x60,
+    0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x80,
+    0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x60,
+    0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x89,
+    0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0xc2,
+    0x00, 0x97, 0x04, 0x00, 0x01, 0x01, 0x80, 0xeb,
+    0xa0, 0x41, 0x6a, 0x91, 0xbf, 0x81, 0xb5, 0xa7,
+    0x8c, 0x82, 0x99, 0x95, 0x94, 0x81, 0x8b, 0x80,
+    0x92, 0x03, 0x1a, 0x00, 0x80, 0x40, 0x86, 0x08,
+    0x80, 0x9f, 0x99, 0x40, 0x83, 0x15, 0x0d, 0x0d,
+    0x0a, 0x16, 0x06, 0x80, 0x88, 0x47, 0x87, 0x20,
+    0xa9, 0x80, 0x88, 0x60, 0xb4, 0xe4, 0x83, 0x54,
+    0xb9, 0x86, 0x8d, 0x87, 0xbf, 0x85, 0x42, 0x3e,
+    0xd4, 0x80, 0xc6, 0x01, 0x08, 0x09, 0x0b, 0x80,
+    0x8b, 0x00, 0x06, 0x80, 0xc0, 0x03, 0x0f, 0x06,
+    0x80, 0x9b, 0x03, 0x04, 0x00, 0x16, 0x80, 0x41,
+    0x53, 0x81, 0x41, 0x23, 0x81, 0xb1, 0x48, 0x2f,
+    0xbd, 0x4d, 0x91, 0x18, 0x9a, 0x01, 0x00, 0x08,
+    0x80, 0x89, 0x03, 0x00, 0x00, 0x28, 0x18, 0x00,
+    0x00, 0x02, 0x01, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x00, 0x01, 0x00, 0x0b, 0x06, 0x03, 0x03, 0x00,
+    0x80, 0x89, 0x80, 0x90, 0x22, 0x04, 0x80, 0x90,
+    0x42, 0x43, 0x8a, 0x84, 0x9e, 0x80, 0x9f, 0x99,
+    0x82, 0xa2, 0x80, 0xee, 0x82, 0x8c, 0xab, 0x83,
+    0x88, 0x31, 0x49, 0x9d, 0x89, 0x60, 0xfc, 0x05,
+    0x42, 0x1d, 0x6b, 0x05, 0xe1, 0x4f, 0xff,
 };
 
 static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = {
@@ -4486,3 +4554,4 @@ static const uint16_t unicode_prop_len_table[] = {
 };
 
 #endif /* CONFIG_ALL_UNICODE */
+/* 62 tables / 32261 bytes, 5 index / 345 bytes */
diff --git a/libregexp/libunicode.c b/libregexp/libunicode.c
index 63c12a0..c80d2f3 100644
--- a/libregexp/libunicode.c
+++ b/libregexp/libunicode.c
@@ -1,6 +1,6 @@
 /*
  * Unicode utilities
- * 
+ *
  * Copyright (c) 2017-2018 Fabrice Bellard
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -43,15 +43,115 @@ enum {
     RUN_TYPE_UF_D1_EXT,
     RUN_TYPE_U_EXT,
     RUN_TYPE_LF_EXT,
-    RUN_TYPE_U_EXT2,
-    RUN_TYPE_L_EXT2,
-    RUN_TYPE_U_EXT3,
+    RUN_TYPE_UF_EXT2,
+    RUN_TYPE_LF_EXT2,
+    RUN_TYPE_UF_EXT3,
 };
 
+static int lre_case_conv1(uint32_t c, int conv_type) /* wrapper around lre_case_conv() that yields a single code point */
+{
+    uint32_t res[LRE_CC_RES_LEN_MAX];
+    lre_case_conv(res, c, conv_type);
+    return res[0]; /* only the first output code point is returned; any further ones are dropped */
+}
+
+/* case conversion using the table entry 'idx' with value 'v'; stores 1 to 3 code points in 'res' and returns that count */
+static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
+{
+    uint32_t code, data, type, a, is_lower;
+    is_lower = (conv_type != 0); /* 1 for "to lower" (1) and for case folding (2) */
+    type = (v >> (32 - 17 - 7 - 4)) & 0xf; /* 4-bit run type, located below the 17-bit code and 7-bit length fields */
+    data = ((v & 0xf) << 8) | case_conv_table2[idx]; /* 12-bit payload: low 4 bits of v plus one byte of case_conv_table2 */
+    code = v >> (32 - 17); /* top 17 bits: first code point of the run */
+    switch(type) {
+    case RUN_TYPE_U:
+    case RUN_TYPE_L:
+    case RUN_TYPE_UF:
+    case RUN_TYPE_LF:
+        if (conv_type == (type & 1) || /* NOTE(review): relies on the enum order of U/L/UF/LF, declared outside this hunk */
+            (type >= RUN_TYPE_UF && conv_type == 2)) {
+            c = c - code + (case_conv_table1[data] >> (32 - 17)); /* same offset inside the run that starts at table1[data]'s code */
+        }
+        break;
+    case RUN_TYPE_UL:
+        a = c - code;
+        if ((a & 1) != (1 - is_lower)) /* lowering only changes even offsets, uppercasing only odd ones */
+            break;
+        c = (a ^ 1) + code; /* swap with the neighbouring code point of the pair */
+        break;
+    case RUN_TYPE_LSU:
+        a = c - code; /* offsets 0/1/2: lowering maps 0 and 1 to 2, uppercasing maps 1 and 2 to 0 */
+        if (a == 1) {
+            c += 2 * is_lower - 1;
+        } else if (a == (1 - is_lower) * 2) {
+            c += (2 * is_lower - 1) * 2;
+        }
+        break;
+    case RUN_TYPE_U2L_399_EXT2:
+        if (!is_lower) {
+            res[0] = c - code + case_conv_ext[data >> 6]; /* 'data' holds two 6-bit indices into case_conv_ext */
+            res[1] = 0x399; /* the second output code point is always U+0399 */
+            return 2;
+        } else {
+            c = c - code + case_conv_ext[data & 0x3f];
+        }
+        break;
+    case RUN_TYPE_UF_D20:
+        if (conv_type == 1) /* "to lower" leaves c unchanged */
+            break;
+        c = data + (conv_type == 2) * 0x20; /* upper: data; folding: data + 0x20 */
+        break;
+    case RUN_TYPE_UF_D1_EXT:
+        if (conv_type == 1) /* "to lower" leaves c unchanged */
+            break;
+        c = case_conv_ext[data] + (conv_type == 2); /* upper: ext[data]; folding: ext[data] + 1 */
+        break;
+    case RUN_TYPE_U_EXT:
+    case RUN_TYPE_LF_EXT:
+        if (is_lower != (type - RUN_TYPE_U_EXT)) /* relies on RUN_TYPE_LF_EXT == RUN_TYPE_U_EXT + 1 */
+            break;
+        c = case_conv_ext[data];
+        break;
+    case RUN_TYPE_LF_EXT2:
+        if (!is_lower) /* "to upper" leaves c unchanged */
+            break;
+        res[0] = c - code + case_conv_ext[data >> 6]; /* two 6-bit indices into case_conv_ext */
+        res[1] = case_conv_ext[data & 0x3f];
+        return 2;
+    case RUN_TYPE_UF_EXT2:
+        if (conv_type == 1) /* "to lower" leaves c unchanged */
+            break;
+        res[0] = c - code + case_conv_ext[data >> 6]; /* two 6-bit indices into case_conv_ext */
+        res[1] = case_conv_ext[data & 0x3f];
+        if (conv_type == 2) {
+            /* folding: lower-case each code point of the upper-case expansion */
+            res[0] = lre_case_conv1(res[0], 1);
+            res[1] = lre_case_conv1(res[1], 1);
+        }
+        return 2;
+    default:
+    case RUN_TYPE_UF_EXT3:
+        if (conv_type == 1) /* "to lower" leaves c unchanged */
+            break;
+        res[0] = case_conv_ext[data >> 8]; /* three 4-bit indices into case_conv_ext */
+        res[1] = case_conv_ext[(data >> 4) & 0xf];
+        res[2] = case_conv_ext[data & 0xf];
+        if (conv_type == 2) {
+            /* folding: lower-case each code point of the upper-case expansion */
+            res[0] = lre_case_conv1(res[0], 1);
+            res[1] = lre_case_conv1(res[1], 1);
+            res[2] = lre_case_conv1(res[2], 1);
+        }
+        return 3;
+    }
+    res[0] = c; /* single code point result (possibly unchanged) */
+    return 1;
+}
+
 /* conv_type:
-   0 = to upper 
+   0 = to upper
    1 = to lower
-   2 = case folding (= to lower with modifications) 
+   2 = case folding (= to lower with modifications)
 */
 int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
 {
@@ -66,10 +166,9 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
             }
         }
     } else {
-        uint32_t v, code, data, type, len, a, is_lower;
+        uint32_t v, code, len;
         int idx, idx_min, idx_max;
-        
-        is_lower = (conv_type != 0);
+
         idx_min = 0;
         idx_max = countof(case_conv_table1) - 1;
         while (idx_min <= idx_max) {
@@ -82,74 +181,7 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
             } else if (c >= code + len) {
                 idx_min = idx + 1;
             } else {
-                type = (v >> (32 - 17 - 7 - 4)) & 0xf;
-                data = ((v & 0xf) << 8) | case_conv_table2[idx];
-                switch(type) {
-                case RUN_TYPE_U:
-                case RUN_TYPE_L:
-                case RUN_TYPE_UF:
-                case RUN_TYPE_LF:
-                    if (conv_type == (type & 1) ||
-                        (type >= RUN_TYPE_UF && conv_type == 2)) {
-                        c = c - code + (case_conv_table1[data] >> (32 - 17));
-                    }
-                    break;
-                case RUN_TYPE_UL:
-                    a = c - code;
-                    if ((a & 1) != (1 - is_lower))
-                        break;
-                    c = (a ^ 1) + code;
-                    break;
-                case RUN_TYPE_LSU:
-                    a = c - code;
-                    if (a == 1) {
-                        c += 2 * is_lower - 1;
-                    } else if (a == (1 - is_lower) * 2) {
-                        c += (2 * is_lower - 1) * 2;
-                    }
-                    break;
-                case RUN_TYPE_U2L_399_EXT2:
-                    if (!is_lower) {
-                        res[0] = c - code + case_conv_ext[data >> 6];
-                        res[1] = 0x399;
-                        return 2;
-                    } else {
-                        c = c - code + case_conv_ext[data & 0x3f];
-                    }
-                    break;
-                case RUN_TYPE_UF_D20:
-                    if (conv_type == 1)
-                        break;
-                    c = data + (conv_type == 2) * 0x20;
-                    break;
-                case RUN_TYPE_UF_D1_EXT:
-                    if (conv_type == 1)
-                        break;
-                    c = case_conv_ext[data] + (conv_type == 2);
-                    break;
-                case RUN_TYPE_U_EXT:
-                case RUN_TYPE_LF_EXT:
-                    if (is_lower != (type - RUN_TYPE_U_EXT))
-                        break;
-                    c = case_conv_ext[data];
-                    break;
-                case RUN_TYPE_U_EXT2:
-                case RUN_TYPE_L_EXT2:
-                    if (conv_type != (type - RUN_TYPE_U_EXT2))
-                        break;
-                    res[0] = c - code + case_conv_ext[data >> 6];
-                    res[1] = case_conv_ext[data & 0x3f];
-                    return 2;
-                default:
-                case RUN_TYPE_U_EXT3:
-                    if (conv_type != 0)
-                        break;
-                    res[0] = case_conv_ext[data >> 8];
-                    res[1] = case_conv_ext[(data >> 4) & 0xf];
-                    res[2] = case_conv_ext[data & 0xf];
-                    return 3;
-                }
-                break;
+                return lre_case_conv_entry(res, c, conv_type, idx, v);
             }
         }
     }
@@ -157,13 +189,80 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
     return 1;
 }
 
+static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode) /* canonicalize 'c' with table entry 'idx' (value 'v'); returns one code point */
+{
+    uint32_t res[LRE_CC_RES_LEN_MAX];
+    int len;
+
+    if (is_unicode) {
+        len = lre_case_conv_entry(res, c, 2, idx, v); /* conv_type 2 = case folding */
+        if (len == 1) {
+            c = res[0];
+        } else {
+            /* handle the few specific multi-character cases (see
+               unicode_gen.c:dump_case_folding_special_cases()) */
+            if (c == 0xfb06) {
+                c = 0xfb05;
+            } else if (c == 0x01fd3) {
+                c = 0x390;
+            } else if (c == 0x01fe3) {
+                c = 0x3b0;
+            } /* any other multi-code-point folding leaves c unchanged */
+        }
+    } else {
+        if (likely(c < 128)) {
+            if (c >= 'a' && c <= 'z') /* ASCII: plain upper-casing, no table lookup */
+                c = c - 'a' + 'A';
+        } else {
+            /* legacy regexp: to upper case if single char >= 128 */
+            len = lre_case_conv_entry(res, c, FALSE, idx, v); /* FALSE == conv_type 0 = to upper */
+            if (len == 1 && res[0] >= 128) /* a result below 128 is rejected, so c stays non-ASCII */
+                c = res[0];
+        }
+    }
+    return c;
+}
+
+/* JS regexp specific rules for case folding: returns the canonical form of 'c', or 'c' itself when no table run covers it */
+int lre_canonicalize(uint32_t c, BOOL is_unicode)
+{
+    if (c < 128) {
+        /* fast case: ASCII is handled without a table lookup */
+        if (is_unicode) {
+            if (c >= 'A' && c <= 'Z') { /* unicode mode canonicalizes to lower case */
+                c = c - 'A' + 'a';
+            }
+        } else {
+            if (c >= 'a' && c <= 'z') { /* legacy mode canonicalizes to upper case */
+                c = c - 'a' + 'A';
+            }
+        }
+    } else {
+        uint32_t v, code, len;
+        int idx, idx_min, idx_max;
+
+        idx_min = 0; /* binary search for the case_conv_table1 run containing c */
+        idx_max = countof(case_conv_table1) - 1;
+        while (idx_min <= idx_max) {
+            idx = (unsigned)(idx_max + idx_min) / 2;
+            v = case_conv_table1[idx];
+            code = v >> (32 - 17); /* first code point of the run */
+            len = (v >> (32 - 17 - 7)) & 0x7f; /* 7-bit run length */
+            if (c < code) {
+                idx_max = idx - 1;
+            } else if (c >= code + len) {
+                idx_min = idx + 1;
+            } else {
+                return lre_case_folding_entry(c, idx, v, is_unicode); /* c lies in [code, code + len) */
+            }
+        }
+    }
+    return c;
+}
+
 static uint32_t get_le24(const uint8_t *ptr)
 {
-#if defined(__x86__) || defined(__x86_64__)
-    return *(uint16_t *)ptr | (ptr[2] << 16);
-#else
     return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
-#endif
 }
 
 #define UNICODE_INDEX_BLOCK_LEN 32
@@ -208,12 +307,20 @@ static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
     uint32_t code, b, bit;
     int pos;
     const uint8_t *p;
-    
+
     pos = get_index_pos(&code, c, index_table, index_table_len);
     if (pos < 0)
         return FALSE; /* outside the table */
     p = table + pos;
     bit = 0;
+    /* Compressed run length encoding:
+       00..3F: 2 packed lengths: 3-bit + 3-bit
+       40..5F: 5-bits plus extra byte for length
+       60..7F: 5-bits plus 2 extra bytes for length
+       80..FF: 7-bit length
+       lengths must be incremented to get character count
+       Ranges alternate between false and true return value.
+     */
     for(;;) {
         b = *p++;
         if (b < 64) {
@@ -241,7 +348,7 @@ BOOL lre_is_cased(uint32_t c)
 {
     uint32_t v, code, len;
     int idx, idx_min, idx_max;
-        
+
     idx_min = 0;
     idx_max = countof(case_conv_table1) - 1;
     while (idx_min <= idx_max) {
@@ -300,7 +407,7 @@ int cr_realloc(CharRange *cr, int size)
 {
     int new_size;
     uint32_t *new_buf;
-    
+
     if (size > cr->size) {
         new_size = max_int(size, cr->size * 3 / 2);
         new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
@@ -327,7 +434,7 @@ static void cr_compress(CharRange *cr)
 {
     int i, j, k, len;
     uint32_t *pt;
-    
+
     pt = cr->points;
     len = cr->len;
     i = 0;
@@ -357,7 +464,7 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
 {
     int a_idx, b_idx, is_in;
     uint32_t v;
-    
+
     a_idx = 0;
     b_idx = 0;
     for(;;) {
@@ -658,7 +765,7 @@ static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
 {
     uint32_t v, type, is_compat, code, len;
     int idx_min, idx_max, idx;
-    
+
     idx_min = 0;
     idx_max = countof(unicode_decomp_table1) - 1;
     while (idx_min <= idx_max) {
@@ -688,7 +795,7 @@ static int unicode_compose_pair(uint32_t c0, uint32_t c1)
     uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
     int idx_min, idx_max, idx, d;
     uint32_t pair[2];
-    
+
     idx_min = 0;
     idx_max = countof(unicode_comp_table) - 1;
     while (idx_min <= idx_max) {
@@ -724,12 +831,19 @@ static int unicode_get_cc(uint32_t c)
     uint32_t code, n, type, cc, c1, b;
     int pos;
     const uint8_t *p;
-    
+
     pos = get_index_pos(&code, c,
                         unicode_cc_index, sizeof(unicode_cc_index) / 3);
     if (pos < 0)
         return 0;
     p = unicode_cc_table + pos;
+    /* Compressed run length encoding:
+       - 2 high order bits are combining class type
+       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
+       - 00..2F: range length (add 1)
+       - 30..37: 3-bit range-length + 1 extra byte
+       - 38..3F: 3-bit range-length + 2 extra byte
+     */
     for(;;) {
         b = *p++;
         type = b >> 6;
@@ -773,7 +887,7 @@ static int unicode_get_cc(uint32_t c)
 static void sort_cc(int *buf, int len)
 {
     int i, j, k, cc, cc1, start, ch1;
-    
+
     for(i = 0; i < len; i++) {
         cc = unicode_get_cc(buf[i]);
         if (cc != 0) {
@@ -812,7 +926,7 @@ static void to_nfd_rec(DynBuf *dbuf,
     uint32_t c, v;
     int i, l;
     uint32_t res[UNICODE_DECOMP_LEN_MAX];
-    
+
     for(i = 0; i < src_len; i++) {
         c = src[i];
         if (c >= 0xac00 && c < 0xd7a4) {
@@ -857,7 +971,7 @@ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
     int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
     BOOL is_compat;
     DynBuf dbuf_s, *dbuf = &dbuf_s;
-    
+
     is_compat = n_type >> 1;
 
     dbuf_init2(dbuf, opaque, realloc_func);
@@ -885,15 +999,15 @@ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
     }
     buf = (int *)dbuf->buf;
     buf_len = dbuf->size / sizeof(int);
-        
+
     sort_cc(buf, buf_len);
-    
+
     if (buf_len <= 1 || (n_type & 1) != 0) {
         /* NFD / NFKD */
         *pdst = (uint32_t *)buf;
         return buf_len;
     }
-    
+
     i = 1;
     out_len = 1;
     while (i < buf_len) {
@@ -930,7 +1044,7 @@ static int unicode_find_name(const char *name_table, const char *name)
     const char *p, *r;
     int pos;
     size_t name_len, len;
-    
+
     p = name_table;
     pos = 0;
     name_len = strlen(name);
@@ -963,13 +1077,13 @@ int unicode_script(CharRange *cr,
     CharRange cr1_s, *cr1;
     CharRange cr2_s, *cr2 = &cr2_s;
     BOOL is_common;
-    
+
     script_idx = unicode_find_name(unicode_script_name_table, script_name);
     if (script_idx < 0)
         return -2;
     /* Note: we remove the "Unknown" Script */
     script_idx += UNICODE_SCRIPT_Unknown + 1;
-        
+
     is_common = (script_idx == UNICODE_SCRIPT_Common ||
                  script_idx == UNICODE_SCRIPT_Inherited);
     if (is_ext) {
@@ -1082,6 +1196,15 @@ static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
     p = unicode_gc_table;
     p_end = unicode_gc_table + countof(unicode_gc_table);
     c = 0;
+    /* Compressed range encoding:
+       initial byte:
+       bits 0..4: category number (special case 31)
+       bits 5..7: range length (add 1)
+       special case bits 5..7 == 7: read an extra byte
+       - 00..7F: range length (add 7 + 1)
+       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
+       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
+     */
     while (p < p_end) {
         b = *p++;
         n = b >> 5;
@@ -1135,6 +1258,14 @@ static int unicode_prop1(CharRange *cr, int prop_idx)
     p_end = p + unicode_prop_len_table[prop_idx];
     c = 0;
     bit = 0;
+    /* Compressed range encoding:
+       00..3F: 2 packed lengths: 3-bit + 3-bit
+       40..5F: 5-bits plus extra byte for length
+       60..7F: 5-bits plus 2 extra bytes for length
+       80..FF: 7-bit length
+       lengths must be incremented to get character count
+       Ranges alternate between false and true return value.
+     */
     while (p < p_end) {
         c0 = c;
         b = *p++;
@@ -1179,11 +1310,11 @@ static int unicode_case1(CharRange *cr, int case_mask)
 #define MR(x) (1 << RUN_TYPE_ ## x)
     const uint32_t tab_run_mask[3] = {
         MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
-        MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3),
+        MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
 
-        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2),
+        MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
 
-        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT),
+        MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
     };
 #undef MR
     uint32_t mask, v, code, type, len, i, idx;
@@ -1236,7 +1367,136 @@ static int unicode_case1(CharRange *cr, int case_mask)
     }
     return 0;
 }
-        
+
+static int point_cmp(const void *p1, const void *p2, void *arg) /* rqsort() comparator on the first uint32_t of each element; 'arg' is unused */
+{
+    uint32_t v1 = *(uint32_t *)p1;
+    uint32_t v2 = *(uint32_t *)p2;
+    return (v1 > v2) - (v1 < v2); /* -1, 0 or 1; avoids the wrap-around a subtraction could produce */
+}
+
+static void cr_sort_and_remove_overlap(CharRange *cr) /* sorts the (start, end) pairs of 'cr' in place and merges those that touch or overlap */
+{
+    uint32_t start, end, start1, end1, i, j;
+
+    /* the input ranges are not necessarily sorted and may overlap: sort them by start, one pair (two points) per element */
+    rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
+    j = 0; /* write position of the compacted output, never ahead of the read position i */
+    for(i = 0; i < cr->len; ) {
+        start = cr->points[i];
+        end = cr->points[i + 1];
+        i += 2;
+        while (i < cr->len) {
+            start1 = cr->points[i];
+            end1 = cr->points[i + 1];
+            if (start1 > end) {
+                /* |------|
+                 *           |-------| */
+                break; /* disjoint: a pair starting exactly at 'end' is not disjoint and gets merged below */
+            } else if (end1 <= end) {
+                /* |------|
+                 *    |--| */
+                i += 2; /* fully contained: drop it */
+            } else {
+                /* |------|
+                 *     |-------| */
+                end = end1; /* extend the current range */
+                i += 2;
+            }
+        }
+        cr->points[j] = start;
+        cr->points[j + 1] = end;
+        j += 2;
+    }
+    cr->len = j;
+}
+
+/* canonicalize a character set using the JS regex case folding rules
+   (see lre_canonicalize()); returns 0 on success, -1 if a range operation fails */
+int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
+{
+    CharRange cr_inter, cr_mask, cr_result, cr_sub;
+    uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
+
+    cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
+    cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
+
+    if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U)) /* code points the selected conversion can affect */
+        goto fail;
+    if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
+        goto fail;
+
+    if (cr_invert(&cr_mask))
+        goto fail;
+    if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
+        goto fail;
+
+    /* cr_inter = cr & cr_mask, cr_sub = cr & ~cr_mask */
+
+    /* use the case conversion table to compute the result */
+    d_start = -1; /* (uint32_t)-1 marks "no pending output interval" */
+    d_end = -1;
+    idx = 0;
+    v = case_conv_table1[idx];
+    code = v >> (32 - 17);
+    len = (v >> (32 - 17 - 7)) & 0x7f;
+    for(i = 0; i < cr_inter.len; i += 2) {
+        start = cr_inter.points[i];
+        end = cr_inter.points[i + 1];
+
+        for(c = start; c < end; c++) {
+            for(;;) { /* idx only moves forward: assumes cr_inter is sorted and each c is covered by a table run */
+                if (c >= code && c < code + len)
+                    break;
+                idx++;
+                assert(idx < countof(case_conv_table1));
+                v = case_conv_table1[idx];
+                code = v >> (32 - 17);
+                len = (v >> (32 - 17 - 7)) & 0x7f;
+            }
+            d = lre_case_folding_entry(c, idx, v, is_unicode);
+            /* try to merge with the current interval */
+            if (d_start == -1) {
+                d_start = d;
+                d_end = d + 1;
+            } else if (d_end == d) {
+                d_end++;
+            } else {
+                if (cr_add_interval(&cr_result, d_start, d_end)) /* a failed append must not be silently ignored */
+                    goto fail;
+                d_start = d;
+                d_end = d + 1;
+            }
+        }
+    }
+    if (d_start != -1) {
+        if (cr_add_interval(&cr_result, d_start, d_end)) /* flush the last pending interval */
+            goto fail;
+    }
+
+    /* the resulting ranges are not necessarily sorted and may overlap */
+    cr_sort_and_remove_overlap(&cr_result);
+
+    /* or with the character not affected by the case folding */
+    cr->len = 0;
+    if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
+        goto fail;
+
+    cr_free(&cr_inter);
+    cr_free(&cr_mask);
+    cr_free(&cr_result);
+    cr_free(&cr_sub);
+    return 0;
+ fail:
+    cr_free(&cr_inter);
+    cr_free(&cr_mask);
+    cr_free(&cr_result);
+    cr_free(&cr_sub);
+    return -1;
+}
+
 typedef enum {
     POP_GC,
     POP_PROP,
@@ -1256,7 +1516,7 @@ static int unicode_prop_ops(CharRange *cr, ...)
     CharRange stack[POP_STACK_LEN_MAX];
     int stack_len, op, ret, i;
     uint32_t a;
-    
+
     va_start(ap, cr);
     stack_len = 0;
     for(;;) {
@@ -1342,7 +1602,7 @@ int unicode_general_category(CharRange *cr, const char *gc_name)
 {
     int gc_idx;
     uint32_t gc_mask;
-    
+
     gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
     if (gc_idx < 0)
         return -2;
@@ -1360,7 +1620,7 @@ int unicode_general_category(CharRange *cr, const char *gc_name)
 int unicode_prop(CharRange *cr, const char *prop_name)
 {
     int prop_idx, ret;
-    
+
     prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
     if (prop_idx < 0)
         return -2;
@@ -1554,3 +1814,97 @@ int unicode_prop(CharRange *cr, const char *prop_name)
 }
 
 #endif /* CONFIG_ALL_UNICODE */
+
+/*---- lre codepoint categorizing functions ----*/
+
+#define S  UNICODE_C_SPACE
+#define D  UNICODE_C_DIGIT
+#define X  UNICODE_C_XDIGIT
+#define U  UNICODE_C_UPPER
+#define L  UNICODE_C_LOWER
+#define _  UNICODE_C_UNDER
+#define d  UNICODE_C_DOLLAR
+
+uint8_t const lre_ctype_bits[256] = { /* UNICODE_C_* flag set for each code point 0x00..0xFF, 8 entries per row */
+    0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x07 */
+    0, S, S, S, S, S, 0, 0, /* 0x08..0x0F: 0x09..0x0D are spaces */
+    0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x17 */
+    0, 0, 0, 0, 0, 0, 0, 0, /* 0x18..0x1F */
+
+    S, 0, 0, 0, d, 0, 0, 0, /* 0x20..0x27: ' ' and '$' */
+    0, 0, 0, 0, 0, 0, 0, 0, /* 0x28..0x2F */
+    X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D, /* 0x30..0x37: '0'..'7' */
+    X|D, X|D, 0, 0, 0, 0, 0, 0, /* 0x38..0x3F: '8', '9' */
+
+    0, X|U, X|U, X|U, X|U, X|U, X|U, U, /* 0x40..0x47: 'A'..'F' are also hex digits */
+    U, U, U, U, U, U, U, U, /* 0x48..0x4F */
+    U, U, U, U, U, U, U, U, /* 0x50..0x57 */
+    U, U, U, 0, 0, 0, 0, _, /* 0x58..0x5F: 'X'..'Z', then '_' at 0x5F */
+
+    0, X|L, X|L, X|L, X|L, X|L, X|L, L, /* 0x60..0x67: 'a'..'f' are also hex digits */
+    L, L, L, L, L, L, L, L, /* 0x68..0x6F */
+    L, L, L, L, L, L, L, L, /* 0x70..0x77 */
+    L, L, L, 0, 0, 0, 0, 0, /* 0x78..0x7F: 'x'..'z' */
+
+    0, 0, 0, 0, 0, 0, 0, 0, /* 0x80..0x9F: no flags */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    S, 0, 0, 0, 0, 0, 0, 0, /* 0xA0..0xBF: only 0xA0 is flagged, as a space */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0..0xDF: no flags */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+
+    0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0..0xFF: no flags */
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+#undef S
+#undef D
+#undef X
+#undef U
+#undef L
+#undef _
+#undef d
+
+/* white space code point ranges: the Zs, Zl and Zp code points plus U+0009..U+000D and U+FEFF, as sorted [low, high) pairs */
+static const uint16_t char_range_s[] = {
+    10, /* number of [low, high) pairs that follow */
+    0x0009, 0x000D + 1,
+    0x0020, 0x0020 + 1,
+    0x00A0, 0x00A0 + 1,
+    0x1680, 0x1680 + 1,
+    0x2000, 0x200A + 1,
+    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
+    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
+    0x2028, 0x2029 + 1,
+    0x202F, 0x202F + 1,
+    0x205F, 0x205F + 1,
+    0x3000, 0x3000 + 1,
+    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
+    0xFEFF, 0xFEFF + 1,
+};
+
+BOOL lre_is_space_non_ascii(uint32_t c) /* TRUE if 'c' falls in one of the char_range_s ranges from U+00A0 upwards */
+{
+    size_t i, n;
+
+    n = countof(char_range_s);
+    for(i = 5; i < n; i += 2) { /* index 0 is the pair count, 1..4 are the two ASCII ranges: start at the 0x00A0 pair */
+        uint32_t low = char_range_s[i];
+        uint32_t high = char_range_s[i + 1]; /* exclusive upper bound */
+        if (c < low)
+            return FALSE; /* ranges are sorted ascending, so no later range can contain c */
+        if (c < high)
+            return TRUE;
+    }
+    return FALSE;
+}
diff --git a/libregexp/libunicode.h b/libregexp/libunicode.h
index cfa600a..cc2f244 100644
--- a/libregexp/libunicode.h
+++ b/libregexp/libunicode.h
@@ -1,6 +1,6 @@
 /*
  * Unicode utilities
- * 
+ *
  * Copyright (c) 2017-2018 Fabrice Bellard
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -24,26 +24,13 @@
 #ifndef LIBUNICODE_H
 #define LIBUNICODE_H
 
-#include <inttypes.h>
-
-#define LRE_BOOL  int       /* for documentation purposes */
+#include <stdint.h>
 
 /* define it to include all the unicode tables (40KB larger) */
 #define CONFIG_ALL_UNICODE
 
 #define LRE_CC_RES_LEN_MAX 3
 
-typedef enum {
-    UNICODE_NFC,
-    UNICODE_NFD,
-    UNICODE_NFKC,
-    UNICODE_NFKD,
-} UnicodeNormalizationEnum;
-
-int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
-LRE_BOOL lre_is_cased(uint32_t c);
-LRE_BOOL lre_is_case_ignorable(uint32_t c);
-
 /* char ranges */
 
 typedef struct {
@@ -101,10 +88,14 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
 
 int cr_invert(CharRange *cr);
 
-#ifdef CONFIG_ALL_UNICODE
+int cr_regexp_canonicalize(CharRange *cr, int is_unicode);
 
-LRE_BOOL lre_is_id_start(uint32_t c);
-LRE_BOOL lre_is_id_continue(uint32_t c);
+typedef enum {
+    UNICODE_NFC,
+    UNICODE_NFD,
+    UNICODE_NFKC,
+    UNICODE_NFKD,
+} UnicodeNormalizationEnum;
 
 int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                       UnicodeNormalizationEnum n_type,
@@ -112,13 +103,80 @@ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
 
 /* Unicode character range functions */
 
-int unicode_script(CharRange *cr,
-                   const char *script_name, LRE_BOOL is_ext);
+int unicode_script(CharRange *cr, const char *script_name, int is_ext);
 int unicode_general_category(CharRange *cr, const char *gc_name);
 int unicode_prop(CharRange *cr, const char *prop_name);
 
-#endif /* CONFIG_ALL_UNICODE */
+int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
+int lre_canonicalize(uint32_t c, int is_unicode);
+
+/* Code point type categories */
+enum {
+    UNICODE_C_SPACE  = (1 << 0),
+    UNICODE_C_DIGIT  = (1 << 1),
+    UNICODE_C_UPPER  = (1 << 2),
+    UNICODE_C_LOWER  = (1 << 3),
+    UNICODE_C_UNDER  = (1 << 4),
+    UNICODE_C_DOLLAR = (1 << 5),
+    UNICODE_C_XDIGIT = (1 << 6),
+};
+extern uint8_t const lre_ctype_bits[256];
+
+/* zero or non-zero return value */
+int lre_is_cased(uint32_t c);
+int lre_is_case_ignorable(uint32_t c);
+int lre_is_id_start(uint32_t c);
+int lre_is_id_continue(uint32_t c);
+
+static inline int lre_is_space_byte(uint8_t c) {
+    return lre_ctype_bits[c] & UNICODE_C_SPACE;
+}
+
+static inline int lre_is_id_start_byte(uint8_t c) {
+    return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
+                                UNICODE_C_UNDER | UNICODE_C_DOLLAR);
+}
 
-#undef LRE_BOOL
+static inline int lre_is_id_continue_byte(uint8_t c) {
+    return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
+                                UNICODE_C_UNDER | UNICODE_C_DOLLAR |
+                                UNICODE_C_DIGIT);
+}
+
+int lre_is_space_non_ascii(uint32_t c);
+
+static inline int lre_is_space(uint32_t c) {
+    if (c < 256)
+        return lre_is_space_byte(c);
+    else
+        return lre_is_space_non_ascii(c);
+}
+
+static inline int lre_js_is_ident_first(uint32_t c) {
+    if (c < 128) {
+        return lre_is_id_start_byte(c);
+    } else {
+#ifdef CONFIG_ALL_UNICODE
+        return lre_is_id_start(c);
+#else
+        return !lre_is_space_non_ascii(c);
+#endif
+    }
+}
+
+static inline int lre_js_is_ident_next(uint32_t c) {
+    if (c < 128) { /* ASCII: answered by the lre_ctype_bits lookup */
+        return lre_is_id_continue_byte(c);
+    } else {
+        /* ZWNJ and ZWJ are accepted in identifiers */
+        if (c >= 0x200C && c <= 0x200D)
+            return 1; /* literal: TRUE is not declared by <stdint.h> */
+#ifdef CONFIG_ALL_UNICODE
+        return lre_is_id_continue(c);
+#else
+        return !lre_is_space_non_ascii(c);
+#endif
+    }
+}
 
 #endif /* LIBUNICODE_H */

From 5f64d0ac2b409fd8feccd22e2401e6e438135a94 Mon Sep 17 00:00:00 2001
From: kmarius <5224719+kmarius@users.noreply.github.com>
Date: Tue, 25 Jun 2024 16:31:51 +0200
Subject: [PATCH 5/8] implement d flag

---
 jsregexp.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/jsregexp.c b/jsregexp.c
index e01d2be..9452cea 100644
--- a/jsregexp.c
+++ b/jsregexp.c
@@ -335,6 +335,7 @@ static int regexp_gc(lua_State *lstate) {
 
 static void regexp_pushflags(lua_State *lstate, const struct regexp *r) {
   const int flags = lre_get_flags(r->bc);
+  const char *indices = (flags & LRE_FLAG_INDICES) ? "d" : "";
   const char *ignorecase = (flags & LRE_FLAG_IGNORECASE) ? "i" : "";
   const char *global = (flags & LRE_FLAG_GLOBAL) ? "g" : "";
   const char *multiline = (flags & LRE_FLAG_MULTILINE) ? "m" : "";
@@ -342,8 +343,8 @@ static void regexp_pushflags(lua_State *lstate, const struct regexp *r) {
   const char *dotall = (flags & LRE_FLAG_DOTALL) ? "s" : "";
   const char *utf16 = (flags & LRE_FLAG_UNICODE) ? "u" : "";
   const char *sticky = (flags & LRE_FLAG_STICKY) ? "y" : "";
-  lua_pushfstring(lstate, "%s%s%s%s%s%s%s", ignorecase, global, multiline,
-                  named_groups, dotall, utf16, sticky);
+  lua_pushfstring(lstate, "%s%s%s%s%s%s%s%s", indices, ignorecase, global,
+                  multiline, named_groups, dotall, utf16, sticky);
 }
 
 static int regexp_tostring(lua_State *lstate) {
@@ -394,6 +395,7 @@ static int regexp_exec(lua_State *lstate) {
 
   const int capture_count = lre_get_capture_count(r->bc);
   const char *group_names = lre_get_groupnames(r->bc);
+  const bool has_indices = lre_get_flags(r->bc) & LRE_FLAG_INDICES;
 
   const int ret =
       lre_exec(capture, r->bc, (uint8_t *)input->u.str8, rlast_index,
@@ -438,34 +440,94 @@ static int regexp_exec(lua_State *lstate) {
   }
   lua_setfield(lstate, -2, "index");
 
+  if (has_indices) {
+    // [match]
+    lua_createtable(lstate, capture_count + 1, 0); // match.indices
+    // [match, indices]
+    if (group_names) {
+      // push indices.groups table, duplicate it and leave it below match
+      lua_createtable(lstate, 0, capture_count); // match.indices.groups
+      // [match, indices, groups]
+      lua_pushvalue(lstate, -1);
+      // [match, indices, groups, groups]
+      lua_insert(lstate, -4);
+      // [indices.groups, match, indices, groups]
+      lua_setfield(lstate, -2, "groups");
+      // [indices.groups, match, indices]
+    }
+    lua_pushvalue(lstate, -1);
+    // [..., match, indices, indices]
+    lua_setfield(lstate, -3, "indices");
+    // [..., match, indices]
+    lua_insert(lstate, -2); // leave table below the match table
+    // [..., indices, match]
+  }
+
   if (group_names) {
+    // [..., match]
     lua_newtable(lstate); // match.groups
+    // [..., match, groups]
     lua_pushvalue(lstate, -1);
+    // [..., match, groups, groups]
     lua_setfield(lstate, -3, "groups"); // immediately insert into match
-    lua_insert(lstate, -2);             // leave table below the match table
+    // [..., match, groups]
+    lua_insert(lstate, -2); // leave table below the match table
+    // [..., groups, match]
   }
 
+  // [groups.indices?, indices?, groups?, match]
+
   for (int i = 0; i < capture_count; i++) {
+    uint32_t a, b;
     if (input->is_wide_char) {
-      const uint32_t a = input->indices[(capture[2 * i] - input->u.str8) / 2];
-      const uint32_t b =
-          input->indices[(capture[2 * i + 1] - input->u.str8) / 2];
+      a = input->indices[(capture[2 * i] - input->u.str8) / 2];
+      b = input->indices[(capture[2 * i + 1] - input->u.str8) / 2];
       lua_pushlstring(lstate, input->bstr + a, b - a);
     } else {
+      a = capture[2 * i] - input->u.str8;
+      b = capture[2 * i + 1] - input->u.str8;
       lua_pushlstring(lstate, (char *)capture[2 * i],
                       capture[2 * i + 1] - capture[2 * i]);
     }
+
+    if (has_indices) {
+      lua_createtable(lstate, 2, 0);
+      lua_pushinteger(lstate, a + 1);
+      lua_rawseti(lstate, -2, 1);
+      lua_pushinteger(lstate, b);
+      lua_rawseti(lstate, -2, 2);
+      // [..., match, string, {a, b}]
+      if (group_names) {
+        // [indices.groups, indices, groups, match, string, {a, b}]
+        if (i > 0 && *group_names) {
+          // if the current group is named, duplicate and insert into the
+          // correct table
+          lua_pushvalue(lstate, -1);
+          // [indices.groups, indices, groups, match, string, {a, b}, {a,b}]
+          lua_setfield(lstate, -7, group_names);
+        }
+        // [indices.groups, indices, groups, match, string, {a, b}]
+        lua_rawseti(lstate, -5, i);
+      } else {
+        // [indices, match, string, {a, b}]
+        lua_rawseti(lstate, -4, i);
+      }
+    }
+
     if (i > 0 && group_names) {
+      // [..., groups, match, string]
       // if the current group is named, duplicate and insert into the correct
       // table
       if (*group_names) {
         lua_pushvalue(lstate, -1);
+        // [..., groups, match, string, string]
         lua_setfield(lstate, -4, group_names);
         group_names += strlen(group_names);
       }
       group_names++;
     }
 
+    // [..., match, string]
     lua_rawseti(lstate, -2, i);
   }
 
@@ -507,6 +569,8 @@ static int regexp_index(lua_State *lstate) {
       lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_STICKY);
     } else if (streq(key, "unicode")) {
       lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_UNICODE);
+    } else if (streq(key, "has_indices")) {
+      lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_INDICES);
     } else if (streq(key, "source")) {
       lua_pushstring(lstate, r->expr);
     } else if (streq(key, "flags")) {
@@ -564,6 +628,9 @@ static int jsregexp_compile(lua_State *lstate) {
     const char *flags = luaL_checkstring(lstate, 2);
     while (*flags) {
       switch (*(flags++)) {
+      case 'd':
+        re_flags |= LRE_FLAG_INDICES;
+        break;
       case 'i':
         re_flags |= LRE_FLAG_IGNORECASE;
         break;

From 4ad55b0dd071b4d9009932c3a5208e2571e86e3d Mon Sep 17 00:00:00 2001
From: kmarius <5224719+kmarius@users.noreply.github.com>
Date: Tue, 25 Jun 2024 16:36:03 +0200
Subject: [PATCH 6/8] test d flag

---
 test.lua | 126 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 92 insertions(+), 34 deletions(-)

diff --git a/test.lua b/test.lua
index 7786b9f..34b8d67 100644
--- a/test.lua
+++ b/test.lua
@@ -1,4 +1,5 @@
 local jsregexp = require("jsregexp")
+local unpack = unpack or table.unpack
 
 local tests = 0
 local fails = 0
@@ -130,6 +131,50 @@ local function test_exec(str, regex, flags, want)
 				end
 			end
 		end
+		if match_wanted.indices and not match.indices then
+			return fail("expected indices table")
+		end
+		if not match_wanted.indices and match.indices then
+			return fail("expected no indices table")
+		end
+		if match_wanted.indices then
+			if match_wanted.indices.groups and not match.indices.groups then
+				return fail("expected indices.groups table")
+			end
+			if not match_wanted.indices.groups and match.indices.groups then
+				return fail("expected no indices.groups table")
+			end
+			for i = 0, #match.indices do
+				local a, b = unpack(match_wanted.indices[i])
+				local c, d = unpack(match.indices[i])
+				if a ~= c or b ~= d then
+					return fail(
+						string.format("wrong indices for group %d, expected {%d, %d}, got {%d, %d}", i, a, b, c, d)
+					)
+				end
+			end
+			if match_wanted.indices.groups then
+				-- Walk the groups actually returned (not the expectation itself).
+				for key, val in pairs(match.indices.groups) do
+					local wanted = match_wanted.indices.groups[key]
+					if not wanted then
+						return fail(string.format("unexpected key in indices.groups: %s", key))
+					end
+					local a, b = unpack(wanted)
+					local c, d = unpack(val)
+					if a ~= c or b ~= d then
+						local msg = "wrong indices for group %s, expected {%d, %d}, got {%d, %d}"
+						return fail(string.format(msg, key, a, b, c, d))
+					end
+				end
+				-- Every expected named group must also be present in the result.
+				for key in pairs(match_wanted.indices.groups) do
+					if not match.indices.groups[key] then
+						return fail(string.format("missing key in indices.groups: %s", key))
+					end
+				end
+			end
+		end
 	end
 	local match = r:exec(str)
 	if r.global and match then
@@ -388,42 +433,39 @@ test_call(
 -- test("จงฝ่าฟันพัฒนาวิชาการ", "(จงฝ่าฟันพัฒนาวิชาการ)", "", {{"จงฝ่าฟันพัฒนาวิชาการ", groups="จงฝ่าฟันพัฒนาวิชาการ"}})
 
 -- named groups:
-test_call(
-	"The quick brown fox jumps over the lazy dog",
-	"(?<first_word>\\w+) (\\w+) (?<third_word>\\w+)",
-	"n",
-	{ { "The quick brown", groups = { "The", "quick", "brown" }, named_groups = { first_word = "The", third_word = "brown" } } }
-)
+test_call("The quick brown fox jumps over the lazy dog", "(?<first_word>\\w+) (\\w+) (?<third_word>\\w+)", "n", {
+	{
+		"The quick brown",
+		groups = { "The", "quick", "brown" },
+		named_groups = { first_word = "The", third_word = "brown" },
+	},
+})
 test_call(
 	"The qüick bröwn föx jümps över the lazy dög",
 	"(?<first_word>[^ ]+) ([^ ]+) (?<third_word>[^ ]+)",
 	"n",
-	{ { "The qüick bröwn", groups = { "The", "qüick", "bröwn" }, named_groups = {
-		first_word = "The",
-		third_word = "bröwn",
-	} } }
-)
-test_call(
-	"The quick bröwn föx",
-	"(?<first_wörd>[^ ]+) ([^ ]+) (?<third_wörd>[^ ]+)",
-	"n",
 	{
 		{
-			"The quick bröwn",
-			groups = { "The", "quick", "bröwn" },
-			named_groups = { ["first_wörd"] = "The", ["third_wörd"] = "bröwn" },
+			"The qüick bröwn",
+			groups = { "The", "qüick", "bröwn" },
+			named_groups = {
+				first_word = "The",
+				third_word = "bröwn",
+			},
 		},
 	}
 )
-test_call(
-	"𝄞𝄞 𐐷",
-	"(?<word>[^ ]+)",
-	"ng",
+test_call("The quick bröwn föx", "(?<first_wörd>[^ ]+) ([^ ]+) (?<third_wörd>[^ ]+)", "n", {
 	{
-		{ "𝄞𝄞", groups = { "𝄞𝄞" }, named_groups = { word = "𝄞𝄞" } },
-		{ "𐐷", groups = { "𐐷" }, named_groups = { word = "𐐷" } },
-	}
-)
+		"The quick bröwn",
+		groups = { "The", "quick", "bröwn" },
+		named_groups = { ["first_wörd"] = "The", ["third_wörd"] = "bröwn" },
+	},
+})
+test_call("𝄞𝄞 𐐷", "(?<word>[^ ]+)", "ng", {
+	{ "𝄞𝄞", groups = { "𝄞𝄞" }, named_groups = { word = "𝄞𝄞" } },
+	{ "𐐷", groups = { "𐐷" }, named_groups = { word = "𐐷" } },
+})
 
 test_exec("The quick brown", "\\w+", "g", { { [0] = "The" }, { [0] = "quick" }, { [0] = "brown" } })
 test_exec(
@@ -432,15 +474,31 @@ test_exec(
 	"g",
 	{ { [0] = "The quick", "The", "quick" }, { [0] = "brown fox", "brown", "fox" } }
 )
-test_exec(
-	"The quick brown fox",
-	"(?<word1>\\w+) (\\w+)",
-	"g",
+test_exec("The quick brown fox", "(?<word1>\\w+) (\\w+)", "g", {
+	{ [0] = "The quick", "The", "quick", groups = { word1 = "The" } },
+	{ [0] = "brown fox", "brown", "fox", groups = { word1 = "brown" } },
+})
+
+test_exec("The Quick Brown Fox Jumps Over The Lazy Dog", "quick\\s(?<color>brown).+?(jumps)", "di", {
 	{
-		{ [0] = "The quick", "The", "quick", groups = { word1 = "The" } },
-		{ [0] = "brown fox", "brown", "fox", groups = { word1 = "brown" } },
-	}
-)
+		[0] = "Quick Brown Fox Jumps",
+		[1] = "Brown",
+		[2] = "Jumps",
+		indices = {
+			[0] = { 5, 25 },
+			[1] = { 11, 15 },
+			[2] = { 21, 25 },
+			groups = {
+				color = { 11, 15 },
+			},
+		},
+		index = 4,
+		input = "The Quick Brown Fox Jumps Over The Lazy Dog",
+		groups = {
+			color = "Brown",
+		},
+	},
+})
 
 test_test("The quick brown", "\\w+", "", { true })
 test_test("The quick brown", "\\d+", "", { false })

From f99bfa058dfc0cf7b91f3d5d369d2f847f0ff9b3 Mon Sep 17 00:00:00 2001
From: kmarius <5224719+kmarius@users.noreply.github.com>
Date: Thu, 27 Jun 2024 10:19:00 +0200
Subject: [PATCH 7/8] update README.md

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index c64e1f7..387e0a8 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ jsregexp.compile_safe(regex, flags?)
 ```
 that take an ECMAScript regular expression as a string and an optional string of flags, most notably
 
+- `"d"`: provides tables with begin/end indices of match groups in match objects
 - `"i"`: case insensitive search
 - `"g"`: match globally
 - `"n"`: enables named groups (not present in JavaScript, needs to be enabled manually if needed)
@@ -50,6 +51,7 @@ re.source       -- the regexp string
 re.flags        -- a string representing the active flags
 re.dot_all      -- is the dod_all flag set?
 re.global       -- is the global flag set?
+re.has_indices  -- is the indices flag set?
 re.ignore_case  -- is the ignore_case flag set?
 re.multiline    -- is the multiline flag set?
 re.sticky       -- is the sticky flag set?
@@ -88,6 +90,8 @@ m.input          -- the input string
 m.capture_count  -- number of capture groups
 m.index          -- start of the capture (1-based)
 m.groups         -- table of the named groups and their content
+m.indices        -- table of begin/end indices of all match groups (if "d" flag is set)
+m.indices.groups -- table of named groups and their begin/end indices (if "d" flag is set)
 ```
 Calling `tostring` on a match object returns the full  match `m[0]`.
 

From 30c86aca7e6ea740d035ff89f0ab14e63481f22d Mon Sep 17 00:00:00 2001
From: kmarius <5224719+kmarius@users.noreply.github.com>
Date: Fri, 5 Jul 2024 20:00:44 +0200
Subject: [PATCH 8/8] disable testing on macos

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ac07a63..2915895 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,7 +21,7 @@ jobs:
             "luajit-2.1.0-beta3",
           ]
           # TODO: add windows-latest once: https://github.com/leafo/gh-actions-lua/pull/23 is fully released
-        machineTag: ["ubuntu-latest", "macos-latest"]
+        machineTag: ["ubuntu-latest"]
     runs-on: ${{ matrix.machineTag }}
     steps:
       - uses: actions/checkout@v2