From cfc14af7589679c18406c2d3ab0bcc0e8dbe6a35 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:46:20 +0100 Subject: [PATCH 1/8] update description --- jsregexp-0.0.7-1.rockspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsregexp-0.0.7-1.rockspec b/jsregexp-0.0.7-1.rockspec index b71a9d9..62b97ad 100644 --- a/jsregexp-0.0.7-1.rockspec +++ b/jsregexp-0.0.7-1.rockspec @@ -8,7 +8,7 @@ source = { description = { summary = "javascript (ECMA19) regular expressions for lua", detailed = [[ -WIP: This library offers a single function to use javascript regular expressions in lua. It makes use of libregexp from https://bellard.org/quickjs/. +Provides ECMAScript regular expressions for Lua 5.1, 5.2, 5.3, 5.4 and LuaJit. Uses libregexp from Fabrice Bellard's QuickJS. ]], homepage = "https://github.com/kmarius/jsregexp", license = "MIT", From 331d4be26162d651be406b193b8465779e84e5bd Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:08:17 +0200 Subject: [PATCH 2/8] add .clang-format --- .clang-format | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..be9d1eb --- /dev/null +++ b/.clang-format @@ -0,0 +1,224 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + 
AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: false +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterExternBlock: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakAfterAttributes: Never +BreakAfterJavaFieldAnnotations: false +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Attach +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false 
+FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: false +IndentCaseLabels: false +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PackConstructorInitializers: BinPack +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +PPIndentWidth: -1 +QualifierAlignment: Leave +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 
+SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +... 
From 4b7e242dbdee135af6d7b4d90d60ca2ed71a1352 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:09:47 +0200 Subject: [PATCH 3/8] formatting --- jsregexp.c | 276 +++++++++++++++++++++++------------------------ jsregexp.lua | 294 ++++++++++++++++++++++++++++----------------------- test.lua | 210 +++++++++++++++++++++++++----------- 3 files changed, 444 insertions(+), 336 deletions(-) diff --git a/jsregexp.c b/jsregexp.c index b5f8eff..305256d 100644 --- a/jsregexp.c +++ b/jsregexp.c @@ -10,12 +10,11 @@ #include "libregexp/cutils.h" #include "libregexp/libregexp.h" -#define CAPTURE_COUNT_MAX 255 /* from libregexp.c */ +#define CAPTURE_COUNT_MAX 255 /* from libregexp.c */ #define JSREGEXP_MT "jsregexp_meta" #define JSREGEXP_MATCH_MT "jsregexp_match_meta" #define JSSTRING_MT "jsstring_meta" - #if LUA_VERSION_NUM >= 502 #define new_lib(L, l) (luaL_newlib(L, l)) #define lua_tbl_len(L, arg) (lua_rawlen(L, arg)) @@ -33,7 +32,7 @@ #define streq(X, Y) ((*(X) == *(Y)) && strcmp(X, Y) == 0) struct regexp { - char* expr; + char *expr; uint8_t *bc; uint32_t last_index; }; @@ -41,21 +40,19 @@ struct regexp { struct jsstring { bool is_wide_char; uint32_t len; - char* bstr; // base string passed in + char *bstr; // base string passed in uint32_t bstr_len; // base string length - uint32_t* indices; - uint32_t* rev_indices; + uint32_t *indices; + uint32_t *rev_indices; union { - uint8_t* str8; /* 8 bit strings will get an extra null terminator */ - uint16_t* str16; + uint8_t *str8; /* 8 bit strings will get an extra null terminator */ + uint16_t *str16; } u; }; - // check for bytes higher or equal to 0xf0 -static inline bool utf8_contains_non_bmp(const char *s) -{ - uint8_t *q = (uint8_t *) s; +static inline bool utf8_contains_non_bmp(const char *s) { + uint8_t *q = (uint8_t *)s; while (*q) { if ((*q++ & 0xf0) == 0xf0) { return true; @@ -64,9 +61,7 @@ static inline bool utf8_contains_non_bmp(const char *s) return false; } - 
-static inline bool utf8_contains_non_ascii(const char *s) -{ +static inline bool utf8_contains_non_ascii(const char *s) { while (*s) { if (*s++ & 0x80) { return true; @@ -75,27 +70,22 @@ static inline bool utf8_contains_non_ascii(const char *s) return false; } - // returns NULL when malformed unicode is encountered, otherwise returns the // converted string. *utf16_len will contain the length of the string and // *indices an (allocated) array mapping each utf16 code point to the utf8 code // point in the input string. -static inline uint16_t *utf8_to_utf16( - const uint8_t *input, - uint32_t n, - uint32_t *utf16_len, - uint32_t **indices, - uint32_t **rev_indices) -{ - *indices = calloc((n+1), sizeof **indices); +static inline uint16_t *utf8_to_utf16(const uint8_t *input, uint32_t n, + uint32_t *utf16_len, uint32_t **indices, + uint32_t **rev_indices) { + *indices = calloc((n + 1), sizeof **indices); // TODO: lazy way of doing it, later implement using binary search tree - *rev_indices = calloc((n+1), sizeof **indices); - uint16_t *str = malloc((n+1) * sizeof *str); + *rev_indices = calloc((n + 1), sizeof **indices); + uint16_t *str = malloc((n + 1) * sizeof *str); uint16_t *q = str; const uint8_t *pos = input; while (*pos) { - (*indices)[q-str] = pos - input; - (*rev_indices)[pos - input] = q-str; + (*indices)[q - str] = pos - input; + (*rev_indices)[pos - input] = q - str; int c = unicode_from_utf8(pos, UTF8_CHAR_LEN_MAX, &pos); if (c == -1) { // malformed @@ -104,7 +94,7 @@ static inline uint16_t *utf8_to_utf16( free(*rev_indices); return NULL; } - if ((unsigned) c > 0xffff) { + if ((unsigned)c > 0xffff) { *q++ = (((c - 0x10000) >> 10) | (0xd8 << 8)); *q++ = (c & 0xfffff) | (0xdc << 8); } else { @@ -119,22 +109,22 @@ static inline uint16_t *utf8_to_utf16( return str; } - -static int jsstring_new(lua_State* lstate) { - if(lua_isuserdata(lstate, 1)) { +static int jsstring_new(lua_State *lstate) { + if (lua_isuserdata(lstate, 1)) { luaL_checkudata(lstate, 1, 
JSSTRING_MT); lua_pushvalue(lstate, 1); return 1; } size_t input_len; - const uint8_t* input = (uint8_t*)luaL_checklstring(lstate, 1, &input_len); - struct jsstring* ud; - if (utf8_contains_non_ascii((char *) input)) { + const uint8_t *input = (uint8_t *)luaL_checklstring(lstate, 1, &input_len); + struct jsstring *ud; + if (utf8_contains_non_ascii((char *)input)) { uint32_t *indices; uint32_t *rev_indices; uint32_t input_utf16_len; - uint16_t *input_utf16 = utf8_to_utf16(input, input_len, &input_utf16_len, &indices, &rev_indices); + uint16_t *input_utf16 = utf8_to_utf16(input, input_len, &input_utf16_len, + &indices, &rev_indices); if (!input_utf16) { luaL_error(lstate, "malformed unicode"); @@ -144,7 +134,7 @@ static int jsstring_new(lua_State* lstate) { ud->is_wide_char = true; ud->len = input_utf16_len; ud->u.str16 = input_utf16; - ud->bstr = strdup((char*)input); + ud->bstr = strdup((char *)input); ud->bstr_len = input_len; ud->indices = indices; ud->rev_indices = rev_indices; @@ -153,8 +143,8 @@ static int jsstring_new(lua_State* lstate) { ud->is_wide_char = false; ud->len = input_len; ud->bstr_len = input_len; - ud->u.str8 =(uint8_t*) strdup((char*)input); - ud->bstr = (char*)ud->u.str8; + ud->u.str8 = (uint8_t *)strdup((char *)input); + ud->bstr = (char *)ud->u.str8; ud->indices = NULL; ud->rev_indices = NULL; } @@ -163,7 +153,7 @@ static int jsstring_new(lua_State* lstate) { return 1; } -static int jsstring_gc(lua_State* lstate) { +static int jsstring_gc(lua_State *lstate) { struct jsstring *s = lua_touserdata(lstate, 1); free(s->u.str8); free(s->indices); @@ -175,26 +165,22 @@ static int jsstring_gc(lua_State* lstate) { return 0; } -static struct luaL_Reg jsstring_meta[] = { - {"__gc", jsstring_gc}, - {NULL, NULL} -}; +static struct luaL_Reg jsstring_meta[] = {{"__gc", jsstring_gc}, {NULL, NULL}}; -static inline struct jsstring* lua_tojsstring(lua_State *lstate, int arg) { +static inline struct jsstring *lua_tojsstring(lua_State *lstate, int arg) { if 
(lua_isuserdata(lstate, arg)) { // already jsstring - return (struct jsstring*) luaL_checkudata(lstate, arg, JSSTRING_MT); + return (struct jsstring *)luaL_checkudata(lstate, arg, JSSTRING_MT); } else { // coerce to jsstring lua_pushcfunction(lstate, jsstring_new); lua_insert(lstate, arg); lua_call(lstate, 1, 1); - return (struct jsstring*) luaL_checkudata(lstate, arg, JSSTRING_MT); + return (struct jsstring *)luaL_checkudata(lstate, arg, JSSTRING_MT); } } -static int regexp_call(lua_State *lstate) -{ +static int regexp_call(lua_State *lstate) { uint8_t *capture[CAPTURE_COUNT_MAX * 2]; struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT); @@ -202,17 +188,19 @@ static int regexp_call(lua_State *lstate) const int named_groups = lre_get_flags(r->bc) & LRE_FLAG_NAMED_GROUPS; const int capture_count = lre_get_capture_count(r->bc); - struct jsstring* input = lua_tojsstring(lstate, 2); + struct jsstring *input = lua_tojsstring(lstate, 2); int nmatch = 0; int cindex = 0; if (input->is_wide_char) { lua_newtable(lstate); - while (lre_exec(capture, r->bc, input->u.str8, cindex, input->len, 1, NULL) == 1) { + while (lre_exec(capture, r->bc, input->u.str8, cindex, input->len, 1, + NULL) == 1) { if (capture[0] == capture[1]) { - // empty match -> continue matching from next character (to prevent an endless loop). - // This is basically the same implementation as in quickjs, see + // empty match -> continue matching from next character (to prevent an + // endless loop). 
This is basically the same implementation as in + // quickjs, see // https://github.com/bellard/quickjs/blob/2788d71e823b522b178db3b3660ce93689534e6d/quickjs.c#L42857-L42869 cindex++; @@ -221,12 +209,13 @@ static int regexp_call(lua_State *lstate) cindex++; } } else { - cindex = (capture[1] - (uint8_t *) input->u.str16) / 2; + cindex = (capture[1] - (uint8_t *)input->u.str16) / 2; } lua_newtable(lstate); - lua_pushnumber(lstate, 1 + input->indices[(capture[0] - input->u.str8) / 2]); + lua_pushnumber(lstate, + 1 + input->indices[(capture[0] - input->u.str8) / 2]); lua_setfield(lstate, -2, "begin_ind"); lua_pushnumber(lstate, input->indices[(capture[1] - input->u.str8) / 2]); @@ -234,21 +223,23 @@ static int regexp_call(lua_State *lstate) lua_newtable(lstate); - const char* group_names = NULL; + const char *group_names = NULL; if (named_groups) { lua_newtable(lstate); group_names = lre_get_groupnames(r->bc); } for (int i = 1; i < capture_count; i++) { - const uint32_t a = input->indices[(capture[2*i] - input->u.str8) / 2]; - const uint32_t b = input->indices[(capture[2*i+1] - input->u.str8) / 2]; - lua_pushlstring(lstate, input->bstr+a, b-a); + const uint32_t a = input->indices[(capture[2 * i] - input->u.str8) / 2]; + const uint32_t b = + input->indices[(capture[2 * i + 1] - input->u.str8) / 2]; + lua_pushlstring(lstate, input->bstr + a, b - a); lua_rawseti(lstate, -2, i); if (named_groups && group_names != NULL) { if (*group_names != '\0') { // check if current group is named - lua_pushlstring(lstate, input->bstr+a, b-a); + lua_pushlstring(lstate, input->bstr + a, b - a); lua_setfield(lstate, -3, group_names); - group_names += strlen(group_names) + 1; // move to the next group name + group_names += + strlen(group_names) + 1; // move to the next group name } else { group_names += 1; // move to the next group name } @@ -270,7 +261,8 @@ static int regexp_call(lua_State *lstate) } } else { lua_newtable(lstate); - while (lre_exec(capture, r->bc, input->u.str8, cindex, 
input->len, 0, NULL) == 1) { + while (lre_exec(capture, r->bc, input->u.str8, cindex, input->len, 0, + NULL) == 1) { if (capture[0] == capture[1]) { cindex++; } else { @@ -287,19 +279,22 @@ static int regexp_call(lua_State *lstate) lua_newtable(lstate); - const char* group_names = NULL; + const char *group_names = NULL; if (named_groups) { lua_newtable(lstate); group_names = lre_get_groupnames(r->bc); } for (int i = 1; i < capture_count; i++) { - lua_pushlstring(lstate, (char *) capture[2 * i], capture[2 * i + 1] - capture[2 * i]); + lua_pushlstring(lstate, (char *)capture[2 * i], + capture[2 * i + 1] - capture[2 * i]); lua_rawseti(lstate, -2, i); if (named_groups && group_names != NULL) { if (*group_names != '\0') { // check if current group is named - lua_pushlstring(lstate, (char *) capture[2 * i], capture[2 * i + 1] - capture[2 * i]); + lua_pushlstring(lstate, (char *)capture[2 * i], + capture[2 * i + 1] - capture[2 * i]); lua_setfield(lstate, -3, group_names); - group_names += strlen(group_names) + 1; // move to the next group name + group_names += + strlen(group_names) + 1; // move to the next group name } else { group_names += 1; // move to the next group name } @@ -323,29 +318,27 @@ static int regexp_call(lua_State *lstate) return 1; } - -static int regexp_gc(lua_State *lstate) -{ +static int regexp_gc(lua_State *lstate) { struct regexp *r = lua_touserdata(lstate, 1); free(r->bc); free(r->expr); return 0; } -static void regexp_pushflags(lua_State* lstate, const struct regexp *r) { +static void regexp_pushflags(lua_State *lstate, const struct regexp *r) { const int flags = lre_get_flags(r->bc); - const char* ignorecase = (flags & LRE_FLAG_IGNORECASE) ? "i" : ""; - const char* global = (flags & LRE_FLAG_GLOBAL) ? "g" : ""; - const char* multiline = (flags & LRE_FLAG_MULTILINE) ? "m" : ""; - const char* named_groups = (flags & LRE_FLAG_NAMED_GROUPS) ? "n" : ""; - const char* dotall = (flags & LRE_FLAG_DOTALL) ? 
"s" : ""; - const char* utf16 = (flags & LRE_FLAG_UTF16) ? "u" : ""; - const char* sticky = (flags & LRE_FLAG_STICKY) ? "y" : ""; - lua_pushfstring(lstate, "%s%s%s%s%s%s%s", ignorecase, global, multiline, named_groups, dotall, utf16, sticky); + const char *ignorecase = (flags & LRE_FLAG_IGNORECASE) ? "i" : ""; + const char *global = (flags & LRE_FLAG_GLOBAL) ? "g" : ""; + const char *multiline = (flags & LRE_FLAG_MULTILINE) ? "m" : ""; + const char *named_groups = (flags & LRE_FLAG_NAMED_GROUPS) ? "n" : ""; + const char *dotall = (flags & LRE_FLAG_DOTALL) ? "s" : ""; + const char *utf16 = (flags & LRE_FLAG_UTF16) ? "u" : ""; + const char *sticky = (flags & LRE_FLAG_STICKY) ? "y" : ""; + lua_pushfstring(lstate, "%s%s%s%s%s%s%s", ignorecase, global, multiline, + named_groups, dotall, utf16, sticky); } -static int regexp_tostring(lua_State *lstate) -{ +static int regexp_tostring(lua_State *lstate) { const struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT); lua_pushfstring(lstate, "/%s/", r->expr); regexp_pushflags(lstate, r); @@ -353,28 +346,24 @@ static int regexp_tostring(lua_State *lstate) return 1; } - // automatic conversion to the global match string -static int match_tostring(lua_State *lstate) -{ - //luaL_getmetatable(lstate, JSREGEXP_MATCH); - //if (!lua_getmetatable(lstate, 1) || !lua_equal(lstate, -1, -2)) { - // luaL_argerror(lstate, 1, "match object expected"); - //} +static int match_tostring(lua_State *lstate) { + // luaL_getmetatable(lstate, JSREGEXP_MATCH); + // if (!lua_getmetatable(lstate, 1) || !lua_equal(lstate, -1, -2)) { + // luaL_argerror(lstate, 1, "match object expected"); + // } lua_rawgeti(lstate, 1, 0); return 1; } - // repeatedly running regexp:match(input) is not a good idea because we would // convert the string (at least from last_ind) to utf16 every time (if it is // needed) -static int regexp_exec(lua_State *lstate) -{ +static int regexp_exec(lua_State *lstate) { uint8_t *capture[CAPTURE_COUNT_MAX * 2]; struct regexp *r = 
luaL_checkudata(lstate, 1, JSREGEXP_MT); - const struct jsstring* input = lua_tojsstring(lstate, 2); + const struct jsstring *input = lua_tojsstring(lstate, 2); const int global = lre_get_flags(r->bc) & LRE_FLAG_GLOBAL; const int sticky = lre_get_flags(r->bc) & LRE_FLAG_STICKY; @@ -396,10 +385,11 @@ static int regexp_exec(lua_State *lstate) } const int capture_count = lre_get_capture_count(r->bc); - const char* group_names = lre_get_groupnames(r->bc); + const char *group_names = lre_get_groupnames(r->bc); - const int ret = lre_exec(capture, r->bc, (uint8_t *) input->u.str8, rlast_index, - input->len, input->is_wide_char ? 1 : 0, NULL); + const int ret = + lre_exec(capture, r->bc, (uint8_t *)input->u.str8, rlast_index, + input->len, input->is_wide_char ? 1 : 0, NULL); if (ret < 0) { luaL_error(lstate, "out of memory in regexp execution"); @@ -411,7 +401,7 @@ static int regexp_exec(lua_State *lstate) r->last_index = 0; } return 0; - } else if (global || sticky) { + } else if (global || sticky) { // match found if (input->is_wide_char) { r->last_index = input->indices[(capture[1] - input->u.str8) / 2]; @@ -431,16 +421,17 @@ static int regexp_exec(lua_State *lstate) lua_pushinteger(lstate, capture_count); lua_setfield(lstate, -2, "capture_count"); - if (input->is_wide_char) { - lua_pushnumber(lstate, 1 + input->indices[(capture[0] - input->u.str8) / 2]); // 1-based + lua_pushnumber( + lstate, + 1 + input->indices[(capture[0] - input->u.str8) / 2]); // 1-based } else { lua_pushnumber(lstate, 1 + capture[0] - input->u.str8); // 1-based } lua_setfield(lstate, -2, "index"); if (group_names) { - lua_newtable(lstate); // match.groups + lua_newtable(lstate); // match.groups lua_pushvalue(lstate, -1); lua_setfield(lstate, -3, "groups"); // immediately insert into match lua_insert(lstate, -2); // leave table below the match table @@ -448,11 +439,13 @@ static int regexp_exec(lua_State *lstate) for (int i = 0; i < capture_count; i++) { if (input->is_wide_char) { - const uint32_t 
a = input->indices[(capture[2*i] - input->u.str8) / 2]; - const uint32_t b = input->indices[(capture[2*i+1] - input->u.str8) / 2]; - lua_pushlstring(lstate, input->bstr+a, b-a); + const uint32_t a = input->indices[(capture[2 * i] - input->u.str8) / 2]; + const uint32_t b = + input->indices[(capture[2 * i + 1] - input->u.str8) / 2]; + lua_pushlstring(lstate, input->bstr + a, b - a); } else { - lua_pushlstring(lstate, (char *) capture[2*i], capture[2*i+1] - capture[2*i]); + lua_pushlstring(lstate, (char *)capture[2 * i], + capture[2 * i + 1] - capture[2 * i]); } if (i > 0 && group_names) { // if the current group is named, duplicate and insert into the correct @@ -471,9 +464,7 @@ static int regexp_exec(lua_State *lstate) return 1; } - -static int regexp_test(lua_State *lstate) -{ +static int regexp_test(lua_State *lstate) { if (lua_gettop(lstate) != 2) { return luaL_error(lstate, "expecting exactly 2 arguments"); } @@ -484,10 +475,8 @@ static int regexp_test(lua_State *lstate) return 1; } - // more gettable fields to be added here -static int regexp_index(lua_State *lstate) -{ +static int regexp_index(lua_State *lstate) { struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT); luaL_getmetatable(lstate, JSREGEXP_MT); @@ -521,10 +510,8 @@ static int regexp_index(lua_State *lstate) return 1; } - // only last_index should be settable -static int regexp_newindex(lua_State *lstate) -{ +static int regexp_newindex(lua_State *lstate) { struct regexp *r = luaL_checkudata(lstate, 1, JSREGEXP_MT); const char *key = lua_tostring(lstate, 2); @@ -539,20 +526,16 @@ static int regexp_newindex(lua_State *lstate) return 0; } -static struct luaL_Reg jsregexp_meta[] = { - {"exec", regexp_exec}, - {"test", regexp_test}, - {"__gc", regexp_gc}, - {"__call", regexp_call}, - {"__tostring", regexp_tostring}, - {"__index", regexp_index}, - {"__newindex", regexp_newindex}, - {NULL, NULL} -}; - +static struct luaL_Reg jsregexp_meta[] = {{"exec", regexp_exec}, + {"test", regexp_test}, + 
{"__gc", regexp_gc}, + {"__call", regexp_call}, + {"__tostring", regexp_tostring}, + {"__index", regexp_index}, + {"__newindex", regexp_newindex}, + {NULL, NULL}}; -static int jsregexp_compile(lua_State *lstate) -{ +static int jsregexp_compile(lua_State *lstate) { char error_msg[64]; int len, re_flags = 0; @@ -573,20 +556,34 @@ static int jsregexp_compile(lua_State *lstate) const char *flags = luaL_checkstring(lstate, 2); while (*flags) { switch (*(flags++)) { - case 'i': re_flags |= LRE_FLAG_IGNORECASE; break; - case 'g': re_flags |= LRE_FLAG_GLOBAL; break; - case 'm': re_flags |= LRE_FLAG_MULTILINE; break; - case 'n': re_flags |= LRE_FLAG_NAMED_GROUPS; break; - case 's': re_flags |= LRE_FLAG_DOTALL; break; - case 'u': re_flags |= LRE_FLAG_UTF16; break; - case 'y': re_flags |= LRE_FLAG_STICKY; break; - default: /* unknown flag */; + case 'i': + re_flags |= LRE_FLAG_IGNORECASE; + break; + case 'g': + re_flags |= LRE_FLAG_GLOBAL; + break; + case 'm': + re_flags |= LRE_FLAG_MULTILINE; + break; + case 'n': + re_flags |= LRE_FLAG_NAMED_GROUPS; + break; + case 's': + re_flags |= LRE_FLAG_DOTALL; + break; + case 'u': + re_flags |= LRE_FLAG_UTF16; + break; + case 'y': + re_flags |= LRE_FLAG_STICKY; + break; + default: /* unknown flag */; } } } uint8_t *bc = lre_compile(&len, error_msg, sizeof error_msg, regexp, - strlen(regexp), re_flags, NULL); + strlen(regexp), re_flags, NULL); if (!bc) { luaL_argerror(lstate, 1, error_msg); @@ -619,16 +616,13 @@ static int jsregexp_compile_safe(lua_State *lstate) { } } - static const struct luaL_Reg jsregexp_lib[] = { - {"compile", jsregexp_compile}, - {"compile_safe", jsregexp_compile_safe}, - {"to_jsstring", jsstring_new}, - {NULL, NULL} -}; + {"compile", jsregexp_compile}, + {"compile_safe", jsregexp_compile_safe}, + {"to_jsstring", jsstring_new}, + {NULL, NULL}}; -int luaopen_jsregexp_core(lua_State *lstate) -{ +int luaopen_jsregexp_core(lua_State *lstate) { luaL_newmetatable(lstate, JSREGEXP_MATCH_MT); lua_pushcfunction(lstate, 
match_tostring); lua_setfield(lstate, -2, "__tostring"); diff --git a/jsregexp.lua b/jsregexp.lua index bfe4275..6af94b2 100644 --- a/jsregexp.lua +++ b/jsregexp.lua @@ -1,70 +1,92 @@ -local jsregexp = require "jsregexp.core" +local jsregexp = require("jsregexp.core") setmetatable(jsregexp, { - __call = function(self, expr, flags) return jsregexp.compile(expr, flags) end + __call = function(self, expr, flags) + return jsregexp.compile(expr, flags) + end, }) function jsregexp.mt.match(re, str) - local jstr = jsregexp.to_jsstring(str) - if not re.global then return re:exec(jstr) end - local matches = {} - local val - - re.last_index = 1 - - while true do - val = re:exec(jstr) - if val == nil then break end - table.insert(matches, val) - if #val[0] == 0 then re.last_index = re.last_index + 1 end - end - if #matches == 0 then return nil end - return matches + local jstr = jsregexp.to_jsstring(str) + if not re.global then + return re:exec(jstr) + end + local matches = {} + local val + + re.last_index = 1 + + while true do + val = re:exec(jstr) + if val == nil then + break + end + table.insert(matches, val) + if #val[0] == 0 then + re.last_index = re.last_index + 1 + end + end + if #matches == 0 then + return nil + end + return matches end function jsregexp.mt.match_all(re, str) - -- must duplicate (according to string.proptype.matchAll spec) - local re2 = jsregexp.compile(re.source, re.flags) - local jstr = jsregexp.to_jsstring(str) - return function() return re2:exec(jstr) end + -- must duplicate (according to string.proptype.matchAll spec) + local re2 = jsregexp.compile(re.source, re.flags) + local jstr = jsregexp.to_jsstring(str) + return function() + return re2:exec(jstr) + end end function jsregexp.mt.match_all_list(re, str) - local matches = {} - for match in jsregexp.mt.match_all(re, str) do table.insert(matches, match) end - return matches + local matches = {} + for match in jsregexp.mt.match_all(re, str) do + table.insert(matches, match) + end + return matches 
end function jsregexp.mt.search(re, str) - -- spec says to start at 1 and restore last_index - local prev_last_index = re.last_index - re.last_index = 1 - local match = re:exec(str) - re.last_index = prev_last_index - if match == nil then return -1 end - return match.index + -- spec says to start at 1 and restore last_index + local prev_last_index = re.last_index + re.last_index = 1 + local match = re:exec(str) + re.last_index = prev_last_index + if match == nil then + return -1 + end + return match.index end function jsregexp.mt.split(re, str, limit) - if limit == nil then limit = math.huge end - if limit == 0 then return {} end - assert(limit >= 0, "limit must be non-negative") + if limit == nil then + limit = math.huge + end + if limit == 0 then + return {} + end + assert(limit >= 0, "limit must be non-negative") - local jstr = jsregexp.to_jsstring(str) - local re2 = jsregexp.compile(re.source, re.flags .. "y") -- add sticky + local jstr = jsregexp.to_jsstring(str) + local re2 = jsregexp.compile(re.source, re.flags .. 
"y") -- add sticky - local count = 0 - local split = {} + local count = 0 + local split = {} local prev_index = 1 - while count < limit do + while count < limit do local li = re2.last_index - local match = re2:exec(jstr) + local match = re2:exec(jstr) if match then if #str == 0 then break end local sub = string.sub(str, prev_index, match.index - 1) - if #sub > 0 or #match[0] > 0 then table.insert(split, sub) end + if #sub > 0 or #match[0] > 0 then + table.insert(split, sub) + end for _, group in ipairs(match) do if count < limit then table.insert(split, group) @@ -83,105 +105,109 @@ function jsregexp.mt.split(re, str, limit) table.insert(split, sub) break end - end - return split + end + return split end local function is_digit(c, i) - local b = string.byte(c, i, i + 1) - return b >= string.byte('0') and b <= string.byte('9') + local b = string.byte(c, i, i + 1) + return b >= string.byte("0") and b <= string.byte("9") end local function get_substitution(match, str, replacement) - local result = {} - - local i = 1 - local repl_len = #replacement - - while true do - local j = string.find(replacement, "$", i, true) - if j == nil or j + 1 > repl_len then break end - table.insert(result, string.sub(replacement, i, j - 1)) - local j0 = j - local c = string.sub(replacement, j + 1, j + 1) - j = j + 2 - if c == '$' then - table.insert(result, "$") - elseif c == '&' then - table.insert(result, match[0]) - elseif c == '`' then - table.insert(result, string.sub(str, 1, match.index)) - elseif c == '\'' then - table.insert(result, string.sub(str, match.index + #match[0])) - elseif is_digit(c, 1) then - local k = c - local kv - local dig2 = false - if j <= repl_len and is_digit(replacement, j) then - k = k .. 
string.sub(replacement, j, j) - dig2 = true - end - local kv1 = tonumber(k) - assert(kv1 ~= nil) - - -- This behavior is specified in ES6 and refined in ECMA 2019 - if dig2 and kv1 >= 1 and match[kv1] ~= nil then - kv = kv1 - j = j + 1 - else - kv = tonumber(k) - assert(kv ~= nil) - end - if kv >= 1 and match[kv] ~= nil then - table.insert(result, match[kv]) - else - table.insert(result, string.sub(replacement, j0, j)) - end - elseif c == '<' and match.groups ~= nil then - local k = string.find(replacement, ">", j, true) - if k == nil then - table.insert(result, string.sub(replacement, j0, j)) - else - local name = string.sub(replacement, j, k - 1) - local capture = match.groups[name] - assert(capture ~= nil, "invalid capture name: " .. name) - table.insert(result, capture) - j = k + 1 - end - else - table.insert(result, string.sub(replacement, j0, j)) - end - - i = j - end - table.insert(result, string.sub(replacement, i)) - return table.concat(result) + local result = {} + + local i = 1 + local repl_len = #replacement + + while true do + local j = string.find(replacement, "$", i, true) + if j == nil or j + 1 > repl_len then + break + end + table.insert(result, string.sub(replacement, i, j - 1)) + local j0 = j + local c = string.sub(replacement, j + 1, j + 1) + j = j + 2 + if c == "$" then + table.insert(result, "$") + elseif c == "&" then + table.insert(result, match[0]) + elseif c == "`" then + table.insert(result, string.sub(str, 1, match.index)) + elseif c == "'" then + table.insert(result, string.sub(str, match.index + #match[0])) + elseif is_digit(c, 1) then + local k = c + local kv + local dig2 = false + if j <= repl_len and is_digit(replacement, j) then + k = k .. 
string.sub(replacement, j, j) + dig2 = true + end + local kv1 = tonumber(k) + assert(kv1 ~= nil) + + -- This behavior is specified in ES6 and refined in ECMA 2019 + if dig2 and kv1 >= 1 and match[kv1] ~= nil then + kv = kv1 + j = j + 1 + else + kv = tonumber(k) + assert(kv ~= nil) + end + if kv >= 1 and match[kv] ~= nil then + table.insert(result, match[kv]) + else + table.insert(result, string.sub(replacement, j0, j)) + end + elseif c == "<" and match.groups ~= nil then + local k = string.find(replacement, ">", j, true) + if k == nil then + table.insert(result, string.sub(replacement, j0, j)) + else + local name = string.sub(replacement, j, k - 1) + local capture = match.groups[name] + assert(capture ~= nil, "invalid capture name: " .. name) + table.insert(result, capture) + j = k + 1 + end + else + table.insert(result, string.sub(replacement, j0, j)) + end + + i = j + end + table.insert(result, string.sub(replacement, i)) + return table.concat(result) end function jsregexp.mt.replace_all(re, str, replacement) - local jstr = jsregexp.to_jsstring(str) - - re.last_index = 1 - - local output = {} - - local prev_index = 1 - local cur_index = 1 - while true do - prev_index = re.last_index - local match = re:exec(jstr) - if match == nil then break end - cur_index = re.last_index - - table.insert(output, string.sub(str, prev_index, match.index - 1)) - if type(replacement) == "function" then - table.insert(output, replacement(match, str)) - else - table.insert(output, get_substitution(match, str, replacement)) - end - end - table.insert(output, string.sub(str, cur_index)) - return table.concat(output) + local jstr = jsregexp.to_jsstring(str) + + re.last_index = 1 + + local output = {} + + local prev_index = 1 + local cur_index = 1 + while true do + prev_index = re.last_index + local match = re:exec(jstr) + if match == nil then + break + end + cur_index = re.last_index + + table.insert(output, string.sub(str, prev_index, match.index - 1)) + if type(replacement) == 
"function" then + table.insert(output, replacement(match, str)) + else + table.insert(output, get_substitution(match, str, replacement)) + end + end + table.insert(output, string.sub(str, cur_index)) + return table.concat(output) end function jsregexp.mt.replace(re, str, replacement) @@ -189,11 +215,11 @@ function jsregexp.mt.replace(re, str, replacement) return jsregexp.mt.replace_all(re, str, replacement) end - local jstr = jsregexp.to_jsstring(str) + local jstr = jsregexp.to_jsstring(str) - re.last_index = 1 + re.last_index = 1 - local output = {} + local output = {} local match = re:exec(jstr) if match then diff --git a/test.lua b/test.lua index 9fdd86c..7786b9f 100644 --- a/test.lua +++ b/test.lua @@ -65,10 +65,17 @@ local function test_call(str, regex, flags, want) fails = fails + 1 return end - for k,v in pairs(want.named_groups) do + for k, v in pairs(want.named_groups) do if val.named_groups[k] ~= v then fails = fails + 1 - print(string.format("named group mismatch group '%s': expected '%s', actual '%s'", k, v, val.named_groups[k])) + print( + string.format( + "named group mismatch group '%s': expected '%s', actual '%s'", + k, + v, + val.named_groups[k] + ) + ) return end end @@ -117,7 +124,9 @@ local function test_exec(str, regex, flags, want) if match_wanted.groups then for key, val in pairs(match_wanted.groups) do if val ~= match.groups[key] then - return fail(string.format("named group %s mismatch, wanted %s, got %s", key, val, match.groups[key])) + return fail( + string.format("named group %s mismatch, wanted %s, got %s", key, val, match.groups[key]) + ) end end end @@ -249,7 +258,7 @@ local function test_split(str, regex, flags, want) end local split = r:split(str) local min = math.min(#split, #want) - for i = 1,min do + for i = 1, min do local w = want[i] if w ~= split[i] then return fail("split mismatch, wanted %s, got %s", w, split[i]) @@ -286,95 +295,174 @@ test_compile("dummy", "[", "", nil) -- (luajit at least..) 
test_compile("dummy", string.char(0xfd, 166, 178, 165, 138, 183), "", nil) -test_call("dummy", ".", "", {{"d"}}) -test_call("du", ".", "g", {{"d"}, {"u"}}) +test_call("dummy", ".", "", { { "d" } }) +test_call("du", ".", "g", { { "d" }, { "u" } }) test_call("dummy", "c", "", {}) test_call("dummy", "c", "g", {}) -test_call("dummy", "d", "", {{"d"}}) -test_call("dummy", "m", "", {{"m"}}) -test_call("dummy", "m", "g", {{"m"}, {"m"}}) - -test_call("dummy", "(dummy)", "", {{"dummy", groups = {"dummy"}}}) -test_call("The quick brown fox jumps over the lazy dog", "\\w+", "", {{"The"}}) -test_call("The quick brown fox jumps over the lazy dog", "\\w+", "g", {{"The"}, {"quick"}, {"brown"}, {"fox"}, {"jumps"}, {"over"}, {"the"}, {"lazy"}, {"dog"}}) -test_call("The quick brown fox jumps over the lazy dog", "[aeiou]{2,}", "g", {{"ui"}}) - -test_call("äöü", ".", "g", {{"ä"}, {"ö"}, {"ü"}}) -test_call("äöü", ".", "", {{"ä"}}) -test_call("ÄÖÜ", ".", "", {{"Ä"}}) -test_call("äöü", "[äöü]", "g", {{"ä"}, {"ö"}, {"ü"}}) -test_call("äöü", "[äöü]*", "g", {{"äöü"}, {""}}) -test_call("äÄ", "ä", "gi", {{"ä"}, {"Ä"}}) -test_call("öäü.haha", "([^.]*)\\.(.*)", "", {{"öäü.haha", groups={"öäü", "haha"}}}) - -test_call("𝄞", "𝄞", "", {{"𝄞"}}) +test_call("dummy", "d", "", { { "d" } }) +test_call("dummy", "m", "", { { "m" } }) +test_call("dummy", "m", "g", { { "m" }, { "m" } }) + +test_call("dummy", "(dummy)", "", { { "dummy", groups = { "dummy" } } }) +test_call("The quick brown fox jumps over the lazy dog", "\\w+", "", { { "The" } }) +test_call( + "The quick brown fox jumps over the lazy dog", + "\\w+", + "g", + { { "The" }, { "quick" }, { "brown" }, { "fox" }, { "jumps" }, { "over" }, { "the" }, { "lazy" }, { "dog" } } +) +test_call("The quick brown fox jumps over the lazy dog", "[aeiou]{2,}", "g", { { "ui" } }) + +test_call("äöü", ".", "g", { { "ä" }, { "ö" }, { "ü" } }) +test_call("äöü", ".", "", { { "ä" } }) +test_call("ÄÖÜ", ".", "", { { "Ä" } }) +test_call("äöü", "[äöü]", "g", { { "ä" }, { 
"ö" }, { "ü" } }) +test_call("äöü", "[äöü]*", "g", { { "äöü" }, { "" } }) +test_call("äÄ", "ä", "gi", { { "ä" }, { "Ä" } }) +test_call("öäü.haha", "([^.]*)\\.(.*)", "", { { "öäü.haha", groups = { "öäü", "haha" } } }) + +test_call("𝄞", "𝄞", "", { { "𝄞" } }) -- these empty matches are expected and consistent with vscode -test_call("öö öö", "ö*", "g", {{"öö"}, {""}, {"öö"}, {""}}) -test_call("𝄞𝄞 𝄞𝄞", "[^ ]*", "g", {{"𝄞𝄞"}, {""}, {"𝄞𝄞"}, {""}}) -test_call("𝄞𝄞", "𝄞*", "", {{"𝄞𝄞"}}) +test_call("öö öö", "ö*", "g", { { "öö" }, { "" }, { "öö" }, { "" } }) +test_call("𝄞𝄞 𝄞𝄞", "[^ ]*", "g", { { "𝄞𝄞" }, { "" }, { "𝄞𝄞" }, { "" } }) +test_call("𝄞𝄞", "𝄞*", "", { { "𝄞𝄞" } }) -- doesn't work in vscode, matches only a single 𝄞 each time: -test_call("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", {{"𝄞𝄞"}, {""}, {"𝄞𝄞"}, {""}}) +test_call("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { { "𝄞𝄞" }, { "" }, { "𝄞𝄞" }, { "" } }) -- vscode actually splits the center unicode character and produces an extra empty match. we don't. -test_call("öö𐐷öö", "ö*", "g", {{"öö"}, {""}, {"öö"}, {""}}) -test_call("a", "𝄞|a", "g", {{"a"}}) -- utf16 regex, ascii input +test_call("öö𐐷öö", "ö*", "g", { { "öö" }, { "" }, { "öö" }, { "" } }) +test_call("a", "𝄞|a", "g", { { "a" } }) -- utf16 regex, ascii input -test_call("κόσμε", "(κόσμε)", "", {{"κόσμε", groups={"κόσμε"}}}) +test_call("κόσμε", "(κόσμε)", "", { { "κόσμε", groups = { "κόσμε" } } }) -test_call("jordbær fløde på", "(jordbær fløde på)", "", {{"jordbær fløde på", groups={"jordbær fløde på"}}}) +test_call( + "jordbær fløde på", + "(jordbær fløde på)", + "", + { { "jordbær fløde på", groups = { "jordbær fløde på" } } } +) -test_call("Heizölrückstoßabdämpfung", "(Heizölrückstoßabdämpfung)", "", {{"Heizölrückstoßabdämpfung", groups={"Heizölrückstoßabdämpfung"}}}) +test_call( + "Heizölrückstoßabdämpfung", + "(Heizölrückstoßabdämpfung)", + "", + { { "Heizölrückstoßabdämpfung", groups = { "Heizölrückstoßabdämpfung" } } } +) -test_call("Fête l'haï volapük", "(Fête l'haï volapük)", "", {{"Fête l'haï volapük", 
groups={"Fête l'haï volapük"}}}) +test_call( + "Fête l'haï volapük", + "(Fête l'haï volapük)", + "", + { { "Fête l'haï volapük", groups = { "Fête l'haï volapük" } } } +) -test_call("Árvíztűrő tükörfúrógép", "(Árvíztűrő tükörfúrógép)", "", {{"Árvíztűrő tükörfúrógép", groups={"Árvíztűrő tükörfúrógép"}}}) +test_call( + "Árvíztűrő tükörfúrógép", + "(Árvíztűrő tükörfúrógép)", + "", + { { "Árvíztűrő tükörfúrógép", groups = { "Árvíztűrő tükörfúrógép" } } } +) -test_call("いろはにほへとちりぬるを", "(いろはにほへとちりぬるを)", "", {{"いろはにほへとちりぬるを", groups={"いろはにほへとちりぬるを"}}}) +test_call( + "いろはにほへとちりぬるを", + "(いろはにほへとちりぬるを)", + "", + { { "いろはにほへとちりぬるを", groups = { "いろはにほへとちりぬるを" } } } +) -test_call("Съешь же ещё этих мягких французских булок да выпей чаю", "(Съешь же ещё этих мягких французских булок да выпей чаю)", "", {{"Съешь же ещё этих мягких французских булок да выпей чаю", groups={"Съешь же ещё этих мягких французских булок да выпей чаю"}}}) +test_call( + "Съешь же ещё этих мягких французских булок да выпей чаю", + "(Съешь же ещё этих мягких французских булок да выпей чаю)", + "", + { + { + "Съешь же ещё этих мягких французских булок да выпей чаю", + groups = { + "Съешь же ещё этих мягких французских булок да выпей чаю", + }, + }, + } +) -- no idea how thai works -- test("จงฝ่าฟันพัฒนาวิชาการ", "(จงฝ่าฟันพัฒนาวิชาการ)", "", {{"จงฝ่าฟันพัฒนาวิชาการ", groups="จงฝ่าฟันพัฒนาวิชาการ"}}) - -- named groups: -test_call("The quick brown fox jumps over the lazy dog", "(?\\w+) (\\w+) (?\\w+)", "n", -{{"The quick brown", groups={"The", "quick", "brown"}, named_groups={first_word="The", third_word="brown"}}} +test_call( + "The quick brown fox jumps over the lazy dog", + "(?\\w+) (\\w+) (?\\w+)", + "n", + { { "The quick brown", groups = { "The", "quick", "brown" }, named_groups = { first_word = "The", third_word = "brown" } } } ) -test_call("The qüick bröwn föx jümps över the lazy dög", "(?[^ ]+) ([^ ]+) (?[^ ]+)", "n", -{{"The qüick bröwn", groups={"The", "qüick", "bröwn"}, 
named_groups={first_word="The", third_word="bröwn"}}} +test_call( + "The qüick bröwn föx jümps över the lazy dög", + "(?[^ ]+) ([^ ]+) (?[^ ]+)", + "n", + { { "The qüick bröwn", groups = { "The", "qüick", "bröwn" }, named_groups = { + first_word = "The", + third_word = "bröwn", + } } } ) -test_call("The quick bröwn föx", "(?[^ ]+) ([^ ]+) (?[^ ]+)", "n", -{{"The quick bröwn", groups={"The", "quick", "bröwn"}, named_groups={["first_wörd"]="The", ["third_wörd"]="bröwn"}}} +test_call( + "The quick bröwn föx", + "(?[^ ]+) ([^ ]+) (?[^ ]+)", + "n", + { + { + "The quick bröwn", + groups = { "The", "quick", "bröwn" }, + named_groups = { ["first_wörd"] = "The", ["third_wörd"] = "bröwn" }, + }, + } +) +test_call( + "𝄞𝄞 𐐷", + "(?[^ ]+)", + "ng", + { + { "𝄞𝄞", groups = { "𝄞𝄞" }, named_groups = { word = "𝄞𝄞" } }, + { "𐐷", groups = { "𐐷" }, named_groups = { word = "𐐷" } }, + } ) -test_call("𝄞𝄞 𐐷", "(?[^ ]+)", "ng", {{"𝄞𝄞", groups={"𝄞𝄞"}, named_groups={word="𝄞𝄞"}}, {"𐐷", groups={"𐐷"}, named_groups={word="𐐷"}}}) -test_exec("The quick brown", "\\w+", "g", {{[0]="The"}, {[0]="quick"}, {[0]="brown"}}) -test_exec("The quick brown fox", "(\\w+) (\\w+)", "g", {{[0]="The quick", "The", "quick"}, {[0]="brown fox", "brown", "fox"}}) -test_exec("The quick brown fox", "(?\\w+) (\\w+)", "g", -{{[0]="The quick", "The", "quick", groups={word1="The"}}, {[0]="brown fox", "brown", "fox", groups={word1="brown"}}}) +test_exec("The quick brown", "\\w+", "g", { { [0] = "The" }, { [0] = "quick" }, { [0] = "brown" } }) +test_exec( + "The quick brown fox", + "(\\w+) (\\w+)", + "g", + { { [0] = "The quick", "The", "quick" }, { [0] = "brown fox", "brown", "fox" } } +) +test_exec( + "The quick brown fox", + "(?\\w+) (\\w+)", + "g", + { + { [0] = "The quick", "The", "quick", groups = { word1 = "The" } }, + { [0] = "brown fox", "brown", "fox", groups = { word1 = "brown" } }, + } +) -test_test("The quick brown", "\\w+", "", {true}) -test_test("The quick brown", "\\d+", "", {false}) -test_test("The quick 
brown", "\\w+", "g", {true, true, true}) +test_test("The quick brown", "\\w+", "", { true }) +test_test("The quick brown", "\\d+", "", { false }) +test_test("The quick brown", "\\w+", "g", { true, true, true }) test_match("The quick brown", "\\d+", "g", nil) -test_match("The quick brown", "\\w+", "g", {"The", "quick", "brown"}) +test_match("The quick brown", "\\w+", "g", { "The", "quick", "brown" }) test_match_all_list("The quick brown", "\\d+", "g", {}) -test_match_all_list("The quick brown", "\\w+", "g", {"The", "quick", "brown"}) +test_match_all_list("The quick brown", "\\w+", "g", { "The", "quick", "brown" }) test_search("The quick brown", "nothing", "g", -1) test_search("The quick brown", "quick", "g", 5) -test_split("abc", "x", "g", {"abc"}) +test_split("abc", "x", "g", { "abc" }) test_split("", "a?", "g", {}) -test_split("", "a", "g", {""}) -test_split("1-2-3", "-", "g", {"1", "2", "3"}) -test_split("1-2-", "-", "g", {"1", "2", ""}) -test_split("-2-3", "-", "g", {"", "2", "3"}) -test_split("--", "-", "g", {"", "", ""}) -test_split("Hello 1 word. Sentence number 2.", "(\\d)", "g", {"Hello ", "1", " word. Sentence number ", "2", "."}) +test_split("", "a", "g", { "" }) +test_split("1-2-3", "-", "g", { "1", "2", "3" }) +test_split("1-2-", "-", "g", { "1", "2", "" }) +test_split("-2-3", "-", "g", { "", "2", "3" }) +test_split("--", "-", "g", { "", "", "" }) +test_split("Hello 1 word. Sentence number 2.", "(\\d)", "g", { "Hello ", "1", " word. Sentence number ", "2", "." 
}) test_replace("a1b2c", "X", "g", "_", "a1b2c") test_replace("a1b2c", "\\d", "", "_", "a_b2c") From da4d4c036d5c5217ffdfb12fafcb6298a741cc7e Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:18:28 +0200 Subject: [PATCH 4/8] update libregexp --- jsregexp.c | 16 +- libregexp/cutils.c | 6 +- libregexp/cutils.h | 72 +- libregexp/libregexp-opcode.h | 5 +- libregexp/libregexp.c | 4393 +++++++++++++++++----------------- libregexp/libregexp.h | 49 +- libregexp/libunicode-table.h | 285 ++- libregexp/libunicode.c | 564 ++++- libregexp/libunicode.h | 102 +- 9 files changed, 2980 insertions(+), 2512 deletions(-) diff --git a/jsregexp.c b/jsregexp.c index 305256d..e01d2be 100644 --- a/jsregexp.c +++ b/jsregexp.c @@ -31,6 +31,14 @@ #define streq(X, Y) ((*(X) == *(Y)) && strcmp(X, Y) == 0) +// these two functions need to be defined for libregexp +void *lre_realloc(void *opaque, void *ptr, size_t size) { + return realloc(ptr, size); +} +BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) { + return FALSE; +} + struct regexp { char *expr; uint8_t *bc; @@ -332,7 +340,7 @@ static void regexp_pushflags(lua_State *lstate, const struct regexp *r) { const char *multiline = (flags & LRE_FLAG_MULTILINE) ? "m" : ""; const char *named_groups = (flags & LRE_FLAG_NAMED_GROUPS) ? "n" : ""; const char *dotall = (flags & LRE_FLAG_DOTALL) ? "s" : ""; - const char *utf16 = (flags & LRE_FLAG_UTF16) ? "u" : ""; + const char *utf16 = (flags & LRE_FLAG_UNICODE) ? "u" : ""; const char *sticky = (flags & LRE_FLAG_STICKY) ? 
"y" : ""; lua_pushfstring(lstate, "%s%s%s%s%s%s%s", ignorecase, global, multiline, named_groups, dotall, utf16, sticky); @@ -498,7 +506,7 @@ static int regexp_index(lua_State *lstate) { } else if (streq(key, "sticky")) { lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_STICKY); } else if (streq(key, "unicode")) { - lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_UTF16); + lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_UNICODE); } else if (streq(key, "source")) { lua_pushstring(lstate, r->expr); } else if (streq(key, "flags")) { @@ -549,7 +557,7 @@ static int jsregexp_compile(lua_State *lstate) { if (utf8_contains_non_bmp(regexp)) { // bmp range works fine without utf16 flag - re_flags |= LRE_FLAG_UTF16; + re_flags |= LRE_FLAG_UNICODE; } if (!lua_isnoneornil(lstate, 2)) { @@ -572,7 +580,7 @@ static int jsregexp_compile(lua_State *lstate) { re_flags |= LRE_FLAG_DOTALL; break; case 'u': - re_flags |= LRE_FLAG_UTF16; + re_flags |= LRE_FLAG_UNICODE; break; case 'y': re_flags |= LRE_FLAG_STICKY; diff --git a/libregexp/cutils.c b/libregexp/cutils.c index a02fb76..c0aacef 100644 --- a/libregexp/cutils.c +++ b/libregexp/cutils.c @@ -1,6 +1,6 @@ /* * C utilities - * + * * Copyright (c) 2017 Fabrice Bellard * Copyright (c) 2018 Charlie Gordon * @@ -140,7 +140,7 @@ int dbuf_put(DynBuf *s, const uint8_t *data, size_t len) if (dbuf_realloc(s, s->size + len)) return -1; } - memcpy(s->buf + s->size, data, len); + memcpy_no_ub(s->buf + s->size, data, len); s->size += len; return 0; } @@ -172,7 +172,7 @@ int __attribute__((format(printf, 2, 3))) dbuf_printf(DynBuf *s, va_list ap; char buf[128]; int len; - + va_start(ap, fmt); len = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); diff --git a/libregexp/cutils.h b/libregexp/cutils.h index 31f7cd8..f079e5c 100644 --- a/libregexp/cutils.h +++ b/libregexp/cutils.h @@ -1,6 +1,6 @@ /* * C utilities - * + * * Copyright (c) 2017 Fabrice Bellard * Copyright (c) 2018 Charlie Gordon * @@ -26,11 +26,9 @@ #define 
CUTILS_H #include +#include #include -/* set if CPU is big endian */ -#undef WORDS_BIGENDIAN - #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define force_inline inline __attribute__((always_inline)) @@ -48,6 +46,16 @@ #ifndef countof #define countof(x) (sizeof(x) / sizeof((x)[0])) #endif +#ifndef container_of +/* return the pointer of type 'type *' containing 'ptr' as field 'member' */ +#define container_of(ptr, type, member) ((type *)((uint8_t *)(ptr) - offsetof(type, member))) +#endif + +#if !defined(_MSC_VER) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define minimum_length(n) static n +#else +#define minimum_length(n) n +#endif typedef int BOOL; @@ -63,6 +71,12 @@ char *pstrcat(char *buf, int buf_size, const char *s); int strstart(const char *str, const char *val, const char **ptr); int has_suffix(const char *str, const char *suffix); +/* Prevent UB when n == 0 and (src == NULL or dest == NULL) */ +static inline void memcpy_no_ub(void *dest, const void *src, size_t n) { + if (n) + memcpy(dest, src, n); +} + static inline int max_int(int a, int b) { if (a > b) @@ -207,28 +221,34 @@ static inline void put_u8(uint8_t *tab, uint8_t val) *tab = val; } +#ifndef bswap16 static inline uint16_t bswap16(uint16_t x) { return (x >> 8) | (x << 8); } +#endif +#ifndef bswap32 static inline uint32_t bswap32(uint32_t v) { return ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >> 8) | ((v & 0x0000ff00) << 8) | ((v & 0x000000ff) << 24); } +#endif +#ifndef bswap64 static inline uint64_t bswap64(uint64_t v) { - return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) | - ((v & ((uint64_t)0xff << (6 * 8))) >> (5 * 8)) | - ((v & ((uint64_t)0xff << (5 * 8))) >> (3 * 8)) | - ((v & ((uint64_t)0xff << (4 * 8))) >> (1 * 8)) | - ((v & ((uint64_t)0xff << (3 * 8))) << (1 * 8)) | - ((v & ((uint64_t)0xff << (2 * 8))) << (3 * 8)) | - ((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) | + return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) 
| + ((v & ((uint64_t)0xff << (6 * 8))) >> (5 * 8)) | + ((v & ((uint64_t)0xff << (5 * 8))) >> (3 * 8)) | + ((v & ((uint64_t)0xff << (4 * 8))) >> (1 * 8)) | + ((v & ((uint64_t)0xff << (3 * 8))) << (1 * 8)) | + ((v & ((uint64_t)0xff << (2 * 8))) << (3 * 8)) | + ((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) | ((v & ((uint64_t)0xff << (0 * 8))) << (7 * 8)); } +#endif /* XXX: should take an extra argument to pass slack information to the caller */ typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size); @@ -278,6 +298,36 @@ static inline void dbuf_set_error(DynBuf *s) int unicode_to_utf8(uint8_t *buf, unsigned int c); int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp); +static inline BOOL is_surrogate(uint32_t c) +{ + return (c >> 11) == (0xD800 >> 11); // 0xD800-0xDFFF +} + +static inline BOOL is_hi_surrogate(uint32_t c) +{ + return (c >> 10) == (0xD800 >> 10); // 0xD800-0xDBFF +} + +static inline BOOL is_lo_surrogate(uint32_t c) +{ + return (c >> 10) == (0xDC00 >> 10); // 0xDC00-0xDFFF +} + +static inline uint32_t get_hi_surrogate(uint32_t c) +{ + return (c >> 10) - (0x10000 >> 10) + 0xD800; +} + +static inline uint32_t get_lo_surrogate(uint32_t c) +{ + return (c & 0x3FF) | 0xDC00; +} + +static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo) +{ + return 0x10000 + 0x400 * (hi - 0xD800) + (lo - 0xDC00); +} + static inline int from_hex(int c) { if (c >= '0' && c <= '9') diff --git a/libregexp/libregexp-opcode.h b/libregexp/libregexp-opcode.h index f90c23b..f255e09 100644 --- a/libregexp/libregexp-opcode.h +++ b/libregexp/libregexp-opcode.h @@ -1,6 +1,6 @@ /* * Regular Expression Engine - * + * * Copyright (c) 2017-2018 Fabrice Bellard * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -50,8 +50,7 @@ DEF(range32, 3) /* variable length */ DEF(lookahead, 5) DEF(negative_lookahead, 5) DEF(push_char_pos, 1) /* push the character position on the stack */ -DEF(bne_char_pos, 5) /* pop one stack element 
and jump if equal to the character - position */ +DEF(check_advance, 1) /* pop one stack element and check that it is different from the character position */ DEF(prev, 1) /* go to the previous char */ DEF(simple_greedy_quant, 17) diff --git a/libregexp/libregexp.c b/libregexp/libregexp.c index 9637aed..a2d56a7 100644 --- a/libregexp/libregexp.c +++ b/libregexp/libregexp.c @@ -21,22 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include -#include -#include -#include #include +#include +#include +#include #include +#include #include "cutils.h" #include "libregexp.h" +#include "libunicode.h" /* TODO: - - Add full unicode canonicalize rules for character ranges (not - really useful but needed for exact "ignorecase" compatibility). - - Add a lock step execution mode (=linear time execution guaranteed) when the regular expression is "simple" i.e. no backreference nor complicated lookahead. The opcodes are designed for this execution @@ -48,417 +46,357 @@ #endif typedef enum { -#define DEF(id, size) REOP_##id, +#define DEF(id, size) REOP_ ## id, #include "libregexp-opcode.h" #undef DEF - REOP_COUNT, + REOP_COUNT, } REOPCodeEnum; #define CAPTURE_COUNT_MAX 255 #define STACK_SIZE_MAX 255 /* unicode code points */ -#define CP_LS 0x2028 -#define CP_PS 0x2029 +#define CP_LS 0x2028 +#define CP_PS 0x2029 #define TMP_BUF_SIZE 128 typedef struct { - DynBuf byte_code; - const uint8_t *buf_ptr; - const uint8_t *buf_end; - const uint8_t *buf_start; - int re_flags; - BOOL is_utf16; - BOOL ignore_case; - BOOL dotall; - int capture_count; - int total_capture_count; /* -1 = not computed yet */ - int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */ - void *opaque; - DynBuf group_names; - union { - char error_msg[TMP_BUF_SIZE]; - char tmp_buf[TMP_BUF_SIZE]; - } u; + DynBuf byte_code; + const uint8_t *buf_ptr; + const uint8_t *buf_end; + const uint8_t *buf_start; + int re_flags; + BOOL is_unicode; + BOOL ignore_case; + 
BOOL dotall; + int capture_count; + int total_capture_count; /* -1 = not computed yet */ + int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */ + void *opaque; + DynBuf group_names; + union { + char error_msg[TMP_BUF_SIZE]; + char tmp_buf[TMP_BUF_SIZE]; + } u; } REParseState; typedef struct { #ifdef DUMP_REOP - const char *name; + const char *name; #endif - uint8_t size; + uint8_t size; } REOpCode; static const REOpCode reopcode_info[REOP_COUNT] = { #ifdef DUMP_REOP -#define DEF(id, size) {#id, size}, +#define DEF(id, size) { #id, size }, #else -#define DEF(id, size) {size}, +#define DEF(id, size) { size }, #endif #include "libregexp-opcode.h" #undef DEF }; -#define RE_HEADER_FLAGS 0 +#define RE_HEADER_FLAGS 0 #define RE_HEADER_CAPTURE_COUNT 1 -#define RE_HEADER_STACK_SIZE 2 +#define RE_HEADER_STACK_SIZE 2 +#define RE_HEADER_BYTECODE_LEN 3 #define RE_HEADER_LEN 7 -static inline int is_digit(int c) { return c >= '0' && c <= '9'; } - -/* insert 'len' bytes at position 'pos'. Return < 0 if error. */ -static int dbuf_insert(DynBuf *s, int pos, int len) { - if (dbuf_realloc(s, s->size + len)) - return -1; - memmove(s->buf + pos + len, s->buf + pos, s->size - pos); - s->size += len; - return 0; +static inline int is_digit(int c) { + return c >= '0' && c <= '9'; } -/* canonicalize with the specific JS regexp rules */ -static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16) { - uint32_t res[LRE_CC_RES_LEN_MAX]; - int len; - if (is_utf16) { - if (likely(c < 128)) { - if (c >= 'A' && c <= 'Z') - c = c - 'A' + 'a'; - } else { - lre_case_conv(res, c, 2); - c = res[0]; - } - } else { - if (likely(c < 128)) { - if (c >= 'a' && c <= 'z') - c = c - 'a' + 'A'; - } else { - /* legacy regexp: to upper case if single char >= 128 */ - len = lre_case_conv(res, c, FALSE); - if (len == 1 && res[0] >= 128) - c = res[0]; - } - } - return c; +/* insert 'len' bytes at position 'pos'. Return < 0 if error. 
*/ +static int dbuf_insert(DynBuf *s, int pos, int len) +{ + if (dbuf_realloc(s, s->size + len)) + return -1; + memmove(s->buf + pos + len, s->buf + pos, s->size - pos); + s->size += len; + return 0; } static const uint16_t char_range_d[] = { 1, - 0x0030, - 0x0039 + 1, + 0x0030, 0x0039 + 1, }; /* code point ranges for Zs,Zl or Zp property */ static const uint16_t char_range_s[] = { 10, - 0x0009, - 0x000D + 1, - 0x0020, - 0x0020 + 1, - 0x00A0, - 0x00A0 + 1, - 0x1680, - 0x1680 + 1, - 0x2000, - 0x200A + 1, + 0x0009, 0x000D + 1, + 0x0020, 0x0020 + 1, + 0x00A0, 0x00A0 + 1, + 0x1680, 0x1680 + 1, + 0x2000, 0x200A + 1, /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ - 0x2028, - 0x2029 + 1, - 0x202F, - 0x202F + 1, - 0x205F, - 0x205F + 1, - 0x3000, - 0x3000 + 1, + 0x2028, 0x2029 + 1, + 0x202F, 0x202F + 1, + 0x205F, 0x205F + 1, + 0x3000, 0x3000 + 1, /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ - 0xFEFF, - 0xFEFF + 1, + 0xFEFF, 0xFEFF + 1, }; -BOOL lre_is_space(int c) { - int i, n, low, high; - n = (countof(char_range_s) - 1) / 2; - for (i = 0; i < n; i++) { - low = char_range_s[2 * i + 1]; - if (c < low) - return FALSE; - high = char_range_s[2 * i + 2]; - if (c < high) - return TRUE; - } - return FALSE; -} - -uint32_t const lre_id_start_table_ascii[4] = { - /* $ A-Z _ a-z */ - 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE}; - -uint32_t const lre_id_continue_table_ascii[4] = { - /* $ 0-9 A-Z _ a-z */ - 0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE}; - static const uint16_t char_range_w[] = { - 4, 0x0030, 0x0039 + 1, 0x0041, 0x005A + 1, - 0x005F, 0x005F + 1, 0x0061, 0x007A + 1, + 4, + 0x0030, 0x0039 + 1, + 0x0041, 0x005A + 1, + 0x005F, 0x005F + 1, + 0x0061, 0x007A + 1, }; #define CLASS_RANGE_BASE 0x40000000 typedef enum { - CHAR_RANGE_d, - CHAR_RANGE_D, - CHAR_RANGE_s, - CHAR_RANGE_S, - CHAR_RANGE_w, - CHAR_RANGE_W, + CHAR_RANGE_d, + CHAR_RANGE_D, + CHAR_RANGE_s, + CHAR_RANGE_S, + CHAR_RANGE_w, + 
CHAR_RANGE_W, } CharRangeEnum; -static const uint16_t *char_range_table[] = { +static const uint16_t * const char_range_table[] = { char_range_d, char_range_s, char_range_w, }; -static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) { - BOOL invert; - const uint16_t *c_pt; - int len, i; - - invert = c & 1; - c_pt = char_range_table[c >> 1]; - len = *c_pt++; - cr_init(cr, s->opaque, lre_realloc); - for (i = 0; i < len * 2; i++) { - if (cr_add_point(cr, c_pt[i])) - goto fail; - } - if (invert) { - if (cr_invert(cr)) - goto fail; - } - return 0; -fail: - cr_free(cr); - return -1; -} +static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) +{ + BOOL invert; + const uint16_t *c_pt; + int len, i; -static int cr_canonicalize(CharRange *cr) { - CharRange a; - uint32_t pt[2]; - int i, ret; - - cr_init(&a, cr->mem_opaque, lre_realloc); - pt[0] = 'a'; - pt[1] = 'z' + 1; - ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER); - if (ret) - goto fail; - /* convert to upper case */ - /* XXX: the generic unicode case would be much more complicated - and not really useful */ - for (i = 0; i < a.len; i++) { - a.points[i] += 'A' - 'a'; - } - /* Note: for simplicity we keep the lower case ranges */ - ret = cr_union1(cr, a.points, a.len); -fail: - cr_free(&a); - return ret; + invert = c & 1; + c_pt = char_range_table[c >> 1]; + len = *c_pt++; + cr_init(cr, s->opaque, lre_realloc); + for(i = 0; i < len * 2; i++) { + if (cr_add_point(cr, c_pt[i])) + goto fail; + } + if (invert) { + if (cr_invert(cr)) + goto fail; + } + return 0; + fail: + cr_free(cr); + return -1; } #ifdef DUMP_REOP -static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, int buf_len) { - int pos, len, opcode, bc_len, re_flags, i; - uint32_t val; - - assert(buf_len >= RE_HEADER_LEN); - - re_flags = buf[0]; - bc_len = get_u32(buf + 3); - assert(bc_len + RE_HEADER_LEN <= buf_len); - printf("flags: 0x%x capture_count=%d stack_size=%d\n", re_flags, buf[1], - buf[2]); - if 
(re_flags & LRE_FLAG_NAMED_GROUPS) { - const char *p; - p = (char *)buf + RE_HEADER_LEN + bc_len; - printf("named groups: "); - for (i = 1; i < buf[1]; i++) { - if (i != 1) - printf(","); - printf("<%s>", p); - p += strlen(p) + 1; - } - printf("\n"); - assert(p == (char *)(buf + buf_len)); - } - printf("bytecode_len=%d\n", bc_len); - - buf += RE_HEADER_LEN; - pos = 0; - while (pos < bc_len) { - printf("%5u: ", pos); - opcode = buf[pos]; - len = reopcode_info[opcode].size; - if (opcode >= REOP_COUNT) { - printf(" invalid opcode=0x%02x\n", opcode); - break; - } - if ((pos + len) > bc_len) { - printf(" buffer overflow (opcode=0x%02x)\n", opcode); - break; +static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, + int buf_len) +{ + int pos, len, opcode, bc_len, re_flags, i; + uint32_t val; + + assert(buf_len >= RE_HEADER_LEN); + + re_flags = lre_get_flags(buf); + bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN); + assert(bc_len + RE_HEADER_LEN <= buf_len); + printf("flags: 0x%x capture_count=%d stack_size=%d\n", + re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]); + if (re_flags & LRE_FLAG_NAMED_GROUPS) { + const char *p; + p = (char *)buf + RE_HEADER_LEN + bc_len; + printf("named groups: "); + for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) { + if (i != 1) + printf(","); + printf("<%s>", p); + p += strlen(p) + 1; + } + printf("\n"); + assert(p == (char *)(buf + buf_len)); } - printf("%s", reopcode_info[opcode].name); - switch (opcode) { - case REOP_char: - val = get_u16(buf + pos + 1); - if (val >= ' ' && val <= 126) - printf(" '%c'", val); - else - printf(" 0x%04x", val); - break; - case REOP_char32: - val = get_u32(buf + pos + 1); - if (val >= ' ' && val <= 126) - printf(" '%c'", val); - else - printf(" 0x%08x", val); - break; - case REOP_goto: - case REOP_split_goto_first: - case REOP_split_next_first: - case REOP_loop: - case REOP_lookahead: - case REOP_negative_lookahead: - case REOP_bne_char_pos: - val = get_u32(buf + pos + 1); - val += 
(pos + 5); - printf(" %u", val); - break; - case REOP_simple_greedy_quant: - printf(" %u %u %u %u", get_u32(buf + pos + 1) + (pos + 17), - get_u32(buf + pos + 1 + 4), get_u32(buf + pos + 1 + 8), - get_u32(buf + pos + 1 + 12)); - break; - case REOP_save_start: - case REOP_save_end: - case REOP_back_reference: - case REOP_backward_back_reference: - printf(" %u", buf[pos + 1]); - break; - case REOP_save_reset: - printf(" %u %u", buf[pos + 1], buf[pos + 2]); - break; - case REOP_push_i32: - val = get_u32(buf + pos + 1); - printf(" %d", val); - break; - case REOP_range: { - int n, i; - n = get_u16(buf + pos + 1); - len += n * 4; - for (i = 0; i < n * 2; i++) { - val = get_u16(buf + pos + 3 + i * 2); - printf(" 0x%04x", val); - } - } break; - case REOP_range32: { - int n, i; - n = get_u16(buf + pos + 1); - len += n * 8; - for (i = 0; i < n * 2; i++) { - val = get_u32(buf + pos + 3 + i * 4); - printf(" 0x%08x", val); - } - } break; - default: - break; + printf("bytecode_len=%d\n", bc_len); + + buf += RE_HEADER_LEN; + pos = 0; + while (pos < bc_len) { + printf("%5u: ", pos); + opcode = buf[pos]; + len = reopcode_info[opcode].size; + if (opcode >= REOP_COUNT) { + printf(" invalid opcode=0x%02x\n", opcode); + break; + } + if ((pos + len) > bc_len) { + printf(" buffer overflow (opcode=0x%02x)\n", opcode); + break; + } + printf("%s", reopcode_info[opcode].name); + switch(opcode) { + case REOP_char: + val = get_u16(buf + pos + 1); + if (val >= ' ' && val <= 126) + printf(" '%c'", val); + else + printf(" 0x%04x", val); + break; + case REOP_char32: + val = get_u32(buf + pos + 1); + if (val >= ' ' && val <= 126) + printf(" '%c'", val); + else + printf(" 0x%08x", val); + break; + case REOP_goto: + case REOP_split_goto_first: + case REOP_split_next_first: + case REOP_loop: + case REOP_lookahead: + case REOP_negative_lookahead: + val = get_u32(buf + pos + 1); + val += (pos + 5); + printf(" %u", val); + break; + case REOP_simple_greedy_quant: + printf(" %u %u %u %u", + get_u32(buf + 
pos + 1) + (pos + 17), + get_u32(buf + pos + 1 + 4), + get_u32(buf + pos + 1 + 8), + get_u32(buf + pos + 1 + 12)); + break; + case REOP_save_start: + case REOP_save_end: + case REOP_back_reference: + case REOP_backward_back_reference: + printf(" %u", buf[pos + 1]); + break; + case REOP_save_reset: + printf(" %u %u", buf[pos + 1], buf[pos + 2]); + break; + case REOP_push_i32: + val = get_u32(buf + pos + 1); + printf(" %d", val); + break; + case REOP_range: + { + int n, i; + n = get_u16(buf + pos + 1); + len += n * 4; + for(i = 0; i < n * 2; i++) { + val = get_u16(buf + pos + 3 + i * 2); + printf(" 0x%04x", val); + } + } + break; + case REOP_range32: + { + int n, i; + n = get_u16(buf + pos + 1); + len += n * 8; + for(i = 0; i < n * 2; i++) { + val = get_u32(buf + pos + 3 + i * 4); + printf(" 0x%08x", val); + } + } + break; + default: + break; + } + printf("\n"); + pos += len; } - printf("\n"); - pos += len; - } } #endif -static void re_emit_op(REParseState *s, int op) { - dbuf_putc(&s->byte_code, op); +static void re_emit_op(REParseState *s, int op) +{ + dbuf_putc(&s->byte_code, op); } /* return the offset of the u32 value */ -static int re_emit_op_u32(REParseState *s, int op, uint32_t val) { - int pos; - dbuf_putc(&s->byte_code, op); - pos = s->byte_code.size; - dbuf_put_u32(&s->byte_code, val); - return pos; +static int re_emit_op_u32(REParseState *s, int op, uint32_t val) +{ + int pos; + dbuf_putc(&s->byte_code, op); + pos = s->byte_code.size; + dbuf_put_u32(&s->byte_code, val); + return pos; } -static int re_emit_goto(REParseState *s, int op, uint32_t val) { - int pos; - dbuf_putc(&s->byte_code, op); - pos = s->byte_code.size; - dbuf_put_u32(&s->byte_code, val - (pos + 4)); - return pos; +static int re_emit_goto(REParseState *s, int op, uint32_t val) +{ + int pos; + dbuf_putc(&s->byte_code, op); + pos = s->byte_code.size; + dbuf_put_u32(&s->byte_code, val - (pos + 4)); + return pos; } -static void re_emit_op_u8(REParseState *s, int op, uint32_t val) { - 
dbuf_putc(&s->byte_code, op); - dbuf_putc(&s->byte_code, val); +static void re_emit_op_u8(REParseState *s, int op, uint32_t val) +{ + dbuf_putc(&s->byte_code, op); + dbuf_putc(&s->byte_code, val); } -static void re_emit_op_u16(REParseState *s, int op, uint32_t val) { - dbuf_putc(&s->byte_code, op); - dbuf_put_u16(&s->byte_code, val); +static void re_emit_op_u16(REParseState *s, int op, uint32_t val) +{ + dbuf_putc(&s->byte_code, op); + dbuf_put_u16(&s->byte_code, val); } -static int __attribute__((format(printf, 2, 3))) -re_parse_error(REParseState *s, const char *fmt, ...) { - va_list ap; - va_start(ap, fmt); - vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap); - va_end(ap); - return -1; +static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap); + va_end(ap); + return -1; } -static int re_parse_out_of_memory(REParseState *s) { - return re_parse_error(s, "out of memory"); +static int re_parse_out_of_memory(REParseState *s) +{ + return re_parse_error(s, "out of memory"); } /* If allow_overflow is false, return -1 in case of overflow. Otherwise return INT32_MAX. 
*/ -static int parse_digits(const uint8_t **pp, BOOL allow_overflow) { - const uint8_t *p; - uint64_t v; - int c; - - p = *pp; - v = 0; - for (;;) { - c = *p; - if (c < '0' || c > '9') - break; - v = v * 10 + c - '0'; - if (v >= INT32_MAX) { - if (allow_overflow) - v = INT32_MAX; - else - return -1; +static int parse_digits(const uint8_t **pp, BOOL allow_overflow) +{ + const uint8_t *p; + uint64_t v; + int c; + + p = *pp; + v = 0; + for(;;) { + c = *p; + if (c < '0' || c > '9') + break; + v = v * 10 + c - '0'; + if (v >= INT32_MAX) { + if (allow_overflow) + v = INT32_MAX; + else + return -1; + } + p++; } - p++; - } - *pp = p; - return v; + *pp = p; + return v; } -static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) { - const uint8_t *p; - p = *pp; - if (*p != c) - return re_parse_error(s, "expecting '%c'", c); - p++; - *pp = p; - return 0; +static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) +{ + const uint8_t *p; + p = *pp; + if (*p != c) + return re_parse_error(s, "expecting '%c'", c); + p++; + *pp = p; + return 0; } /* Parse an escape sequence, *pp points after the '\': @@ -471,1312 +409,1307 @@ static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) { Return the unicode char and update *pp if recognized, return -1 if malformed escape, return -2 otherwise. 
*/ -int lre_parse_escape(const uint8_t **pp, int allow_utf16) { - const uint8_t *p; - uint32_t c; - - p = *pp; - c = *p++; - switch (c) { - case 'b': - c = '\b'; - break; - case 'f': - c = '\f'; - break; - case 'n': - c = '\n'; - break; - case 'r': - c = '\r'; - break; - case 't': - c = '\t'; - break; - case 'v': - c = '\v'; - break; - case 'x': - case 'u': { - int h, n, i; - uint32_t c1; - - if (*p == '{' && allow_utf16) { - p++; - c = 0; - for (;;) { - h = from_hex(*p++); - if (h < 0) - return -1; - c = (c << 4) | h; - if (c > 0x10FFFF) - return -1; - if (*p == '}') - break; - } - p++; - } else { - if (c == 'x') { - n = 2; - } else { - n = 4; - } - - c = 0; - for (i = 0; i < n; i++) { - h = from_hex(*p++); - if (h < 0) { - return -1; - } - c = (c << 4) | h; - } - if (c >= 0xd800 && c < 0xdc00 && allow_utf16 == 2 && p[0] == '\\' && - p[1] == 'u') { - /* convert an escaped surrogate pair into a - unicode char */ - c1 = 0; - for (i = 0; i < 4; i++) { - h = from_hex(p[2 + i]); - if (h < 0) - break; - c1 = (c1 << 4) | h; - } - if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) { - p += 6; - c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; - } - } - } - } break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - c -= '0'; - if (allow_utf16 == 2) { - /* only accept \0 not followed by digit */ - if (c != 0 || is_digit(*p)) - return -1; - } else { - /* parse a legacy octal sequence */ - uint32_t v; - v = *p - '0'; - if (v > 7) +int lre_parse_escape(const uint8_t **pp, int allow_utf16) +{ + const uint8_t *p; + uint32_t c; + + p = *pp; + c = *p++; + switch(c) { + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; break; - c = (c << 3) | v; - p++; - if (c >= 32) + case 't': + c = '\t'; + break; + case 'v': + c = '\v'; + break; + case 'x': + case 'u': + { + int h, n, i; + uint32_t c1; + + if (*p == '{' && allow_utf16) { + p++; + c = 0; + for(;;) { + h = 
from_hex(*p++); + if (h < 0) + return -1; + c = (c << 4) | h; + if (c > 0x10FFFF) + return -1; + if (*p == '}') + break; + } + p++; + } else { + if (c == 'x') { + n = 2; + } else { + n = 4; + } + + c = 0; + for(i = 0; i < n; i++) { + h = from_hex(*p++); + if (h < 0) { + return -1; + } + c = (c << 4) | h; + } + if (is_hi_surrogate(c) && + allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { + /* convert an escaped surrogate pair into a + unicode char */ + c1 = 0; + for(i = 0; i < 4; i++) { + h = from_hex(p[2 + i]); + if (h < 0) + break; + c1 = (c1 << 4) | h; + } + if (i == 4 && is_lo_surrogate(c1)) { + p += 6; + c = from_surrogate(c, c1); + } + } + } + } break; - v = *p - '0'; - if (v > 7) + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + c -= '0'; + if (allow_utf16 == 2) { + /* only accept \0 not followed by digit */ + if (c != 0 || is_digit(*p)) + return -1; + } else { + /* parse a legacy octal sequence */ + uint32_t v; + v = *p - '0'; + if (v > 7) + break; + c = (c << 3) | v; + p++; + if (c >= 32) + break; + v = *p - '0'; + if (v > 7) + break; + c = (c << 3) | v; + p++; + } break; - c = (c << 3) | v; - p++; + default: + return -2; } - break; - default: - return -2; - } - *pp = p; - return c; + *pp = p; + return c; } #ifdef CONFIG_ALL_UNICODE /* XXX: we use the same chars for name and value */ -static BOOL is_unicode_char(int c) { - return ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || (c == '_')); +static BOOL is_unicode_char(int c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c == '_')); } static int parse_unicode_property(REParseState *s, CharRange *cr, - const uint8_t **pp, BOOL is_inv) { - const uint8_t *p; - char name[64], value[64]; - char *q; - BOOL script_ext; - int ret; - - p = *pp; - if (*p != '{') - return re_parse_error(s, "expecting '{' after \\p"); - p++; - q = name; - while (is_unicode_char(*p)) { - if ((q - name) >= sizeof(name) 
- 1) - goto unknown_property_name; - *q++ = *p++; - } - *q = '\0'; - q = value; - if (*p == '=') { + const uint8_t **pp, BOOL is_inv) +{ + const uint8_t *p; + char name[64], value[64]; + char *q; + BOOL script_ext; + int ret; + + p = *pp; + if (*p != '{') + return re_parse_error(s, "expecting '{' after \\p"); p++; + q = name; while (is_unicode_char(*p)) { - if ((q - value) >= sizeof(value) - 1) - return re_parse_error(s, "unknown unicode property value"); - *q++ = *p++; + if ((q - name) >= sizeof(name) - 1) + goto unknown_property_name; + *q++ = *p++; } - } - *q = '\0'; - if (*p != '}') - return re_parse_error(s, "expecting '}'"); - p++; - // printf("name=%s value=%s\n", name, value); - - if (!strcmp(name, "Script") || !strcmp(name, "sc")) { - script_ext = FALSE; - goto do_script; - } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) { - script_ext = TRUE; - do_script: - cr_init(cr, s->opaque, lre_realloc); - ret = unicode_script(cr, value, script_ext); - if (ret) { - cr_free(cr); - if (ret == -2) - return re_parse_error(s, "unknown unicode script"); - else - goto out_of_memory; - } - } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) { - cr_init(cr, s->opaque, lre_realloc); - ret = unicode_general_category(cr, value); - if (ret) { - cr_free(cr); - if (ret == -2) - return re_parse_error(s, "unknown unicode general category"); - else - goto out_of_memory; - } - } else if (value[0] == '\0') { - cr_init(cr, s->opaque, lre_realloc); - ret = unicode_general_category(cr, name); - if (ret == -1) { - cr_free(cr); - goto out_of_memory; + *q = '\0'; + q = value; + if (*p == '=') { + p++; + while (is_unicode_char(*p)) { + if ((q - value) >= sizeof(value) - 1) + return re_parse_error(s, "unknown unicode property value"); + *q++ = *p++; + } } - if (ret < 0) { - ret = unicode_prop(cr, name); - if (ret) { - cr_free(cr); - if (ret == -2) - goto unknown_property_name; - else - goto out_of_memory; - } + *q = '\0'; + if (*p != '}') + return 
re_parse_error(s, "expecting '}'"); + p++; + // printf("name=%s value=%s\n", name, value); + + if (!strcmp(name, "Script") || !strcmp(name, "sc")) { + script_ext = FALSE; + goto do_script; + } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) { + script_ext = TRUE; + do_script: + cr_init(cr, s->opaque, lre_realloc); + ret = unicode_script(cr, value, script_ext); + if (ret) { + cr_free(cr); + if (ret == -2) + return re_parse_error(s, "unknown unicode script"); + else + goto out_of_memory; + } + } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) { + cr_init(cr, s->opaque, lre_realloc); + ret = unicode_general_category(cr, value); + if (ret) { + cr_free(cr); + if (ret == -2) + return re_parse_error(s, "unknown unicode general category"); + else + goto out_of_memory; + } + } else if (value[0] == '\0') { + cr_init(cr, s->opaque, lre_realloc); + ret = unicode_general_category(cr, name); + if (ret == -1) { + cr_free(cr); + goto out_of_memory; + } + if (ret < 0) { + ret = unicode_prop(cr, name); + if (ret) { + cr_free(cr); + if (ret == -2) + goto unknown_property_name; + else + goto out_of_memory; + } + } + } else { + unknown_property_name: + return re_parse_error(s, "unknown unicode property name"); } - } else { - unknown_property_name: - return re_parse_error(s, "unknown unicode property name"); - } - - if (is_inv) { - if (cr_invert(cr)) { - cr_free(cr); - return -1; + + if (is_inv) { + if (cr_invert(cr)) { + cr_free(cr); + return -1; + } } - } - *pp = p; - return 0; -out_of_memory: - return re_parse_out_of_memory(s); + *pp = p; + return 0; + out_of_memory: + return re_parse_out_of_memory(s); } #endif /* CONFIG_ALL_UNICODE */ /* return -1 if error otherwise the character or a class range (CLASS_RANGE_BASE). In case of class range, 'cr' is initialized. Otherwise, it is ignored. 
*/ -static int get_class_atom(REParseState *s, CharRange *cr, const uint8_t **pp, - BOOL inclass) { - const uint8_t *p; - uint32_t c; - int ret; +static int get_class_atom(REParseState *s, CharRange *cr, + const uint8_t **pp, BOOL inclass) +{ + const uint8_t *p; + uint32_t c; + int ret; - p = *pp; + p = *pp; - c = *p; - switch (c) { - case '\\': - p++; - if (p >= s->buf_end) - goto unexpected_end; - c = *p++; - switch (c) { - case 'd': - c = CHAR_RANGE_d; - goto class_range; - case 'D': - c = CHAR_RANGE_D; - goto class_range; - case 's': - c = CHAR_RANGE_s; - goto class_range; - case 'S': - c = CHAR_RANGE_S; - goto class_range; - case 'w': - c = CHAR_RANGE_w; - goto class_range; - case 'W': - c = CHAR_RANGE_W; - class_range: - if (cr_init_char_range(s, cr, c)) - return -1; - c = CLASS_RANGE_BASE; - break; - case 'c': - c = *p; - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (((c >= '0' && c <= '9') || c == '_') && inclass && - !s->is_utf16)) { /* Annex B.1.4 */ - c &= 0x1f; + c = *p; + switch(c) { + case '\\': p++; - } else if (s->is_utf16) { - goto invalid_escape; - } else { - /* otherwise return '\' and 'c' */ - p--; - c = '\\'; - } - break; + if (p >= s->buf_end) + goto unexpected_end; + c = *p++; + switch(c) { + case 'd': + c = CHAR_RANGE_d; + goto class_range; + case 'D': + c = CHAR_RANGE_D; + goto class_range; + case 's': + c = CHAR_RANGE_s; + goto class_range; + case 'S': + c = CHAR_RANGE_S; + goto class_range; + case 'w': + c = CHAR_RANGE_w; + goto class_range; + case 'W': + c = CHAR_RANGE_W; + class_range: + if (cr_init_char_range(s, cr, c)) + return -1; + c = CLASS_RANGE_BASE; + break; + case 'c': + c = *p; + if ((c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (((c >= '0' && c <= '9') || c == '_') && + inclass && !s->is_unicode)) { /* Annex B.1.4 */ + c &= 0x1f; + p++; + } else if (s->is_unicode) { + goto invalid_escape; + } else { + /* otherwise return '\' and 'c' */ + p--; + c = '\\'; + } + break; #ifdef CONFIG_ALL_UNICODE - case 
'p': - case 'P': - if (s->is_utf16) { - if (parse_unicode_property(s, cr, &p, (c == 'P'))) - return -1; - c = CLASS_RANGE_BASE; - break; - } - /* fall thru */ + case 'p': + case 'P': + if (s->is_unicode) { + if (parse_unicode_property(s, cr, &p, (c == 'P'))) + return -1; + c = CLASS_RANGE_BASE; + break; + } + /* fall thru */ #endif + default: + p--; + ret = lre_parse_escape(&p, s->is_unicode * 2); + if (ret >= 0) { + c = ret; + } else { + if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) { + /* always valid to escape these characters */ + goto normal_char; + } else if (s->is_unicode) { + invalid_escape: + return re_parse_error(s, "invalid escape sequence in regular expression"); + } else { + /* just ignore the '\' */ + goto normal_char; + } + } + break; + } + break; + case '\0': + if (p >= s->buf_end) { + unexpected_end: + return re_parse_error(s, "unexpected end"); + } + /* fall thru */ default: - p--; - ret = lre_parse_escape(&p, s->is_utf16 * 2); - if (ret >= 0) { - c = ret; - } else { - if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) { - /* always valid to escape these characters */ - goto normal_char; - } else if (s->is_utf16) { - invalid_escape: - return re_parse_error( - s, "invalid escape sequence in regular expression"); + normal_char: + /* normal char */ + if (c >= 128) { + c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + if ((unsigned)c > 0xffff && !s->is_unicode) { + /* XXX: should handle non BMP-1 code points */ + return re_parse_error(s, "malformed unicode char"); + } } else { - /* just ignore the '\' */ - goto normal_char; + p++; } - } - break; - } - break; - case '\0': - if (p >= s->buf_end) { - unexpected_end: - return re_parse_error(s, "unexpected end"); - } - /* fall thru */ - default: - normal_char: - /* normal char */ - if (c >= 128) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); - if ((unsigned)c > 0xffff && !s->is_utf16) { - /* XXX: should handle non BMP-1 code points */ - return re_parse_error(s, 
"malformed unicode char"); - } - } else { - p++; + break; } - break; - } - *pp = p; - return c; + *pp = p; + return c; } -static int re_emit_range(REParseState *s, const CharRange *cr) { - int len, i; - uint32_t high; - - len = (unsigned)cr->len / 2; - if (len >= 65535) - return re_parse_error(s, "too many ranges"); - if (len == 0) { - /* not sure it can really happen. Emit a match that is always - false */ - re_emit_op_u32(s, REOP_char32, -1); - } else { - high = cr->points[cr->len - 1]; - if (high == UINT32_MAX) - high = cr->points[cr->len - 2]; - if (high <= 0xffff) { - /* can use 16 bit ranges with the conversion that 0xffff = - infinity */ - re_emit_op_u16(s, REOP_range, len); - for (i = 0; i < cr->len; i += 2) { - dbuf_put_u16(&s->byte_code, cr->points[i]); - high = cr->points[i + 1] - 1; - if (high == UINT32_MAX - 1) - high = 0xffff; - dbuf_put_u16(&s->byte_code, high); - } +static int re_emit_range(REParseState *s, const CharRange *cr) +{ + int len, i; + uint32_t high; + + len = (unsigned)cr->len / 2; + if (len >= 65535) + return re_parse_error(s, "too many ranges"); + if (len == 0) { + /* not sure it can really happen. 
Emit a match that is always + false */ + re_emit_op_u32(s, REOP_char32, -1); } else { - re_emit_op_u16(s, REOP_range32, len); - for (i = 0; i < cr->len; i += 2) { - dbuf_put_u32(&s->byte_code, cr->points[i]); - dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1); - } + high = cr->points[cr->len - 1]; + if (high == UINT32_MAX) + high = cr->points[cr->len - 2]; + if (high <= 0xffff) { + /* can use 16 bit ranges with the conversion that 0xffff = + infinity */ + re_emit_op_u16(s, REOP_range, len); + for(i = 0; i < cr->len; i += 2) { + dbuf_put_u16(&s->byte_code, cr->points[i]); + high = cr->points[i + 1] - 1; + if (high == UINT32_MAX - 1) + high = 0xffff; + dbuf_put_u16(&s->byte_code, high); + } + } else { + re_emit_op_u16(s, REOP_range32, len); + for(i = 0; i < cr->len; i += 2) { + dbuf_put_u32(&s->byte_code, cr->points[i]); + dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1); + } + } } - } - return 0; + return 0; } -static int re_parse_char_class(REParseState *s, const uint8_t **pp) { - const uint8_t *p; - uint32_t c1, c2; - CharRange cr_s, *cr = &cr_s; - CharRange cr1_s, *cr1 = &cr1_s; - BOOL invert; - - cr_init(cr, s->opaque, lre_realloc); - p = *pp; - p++; /* skip '[' */ - invert = FALSE; - if (*p == '^') { - p++; - invert = TRUE; - } - for (;;) { - if (*p == ']') - break; - c1 = get_class_atom(s, cr1, &p, TRUE); - if ((int)c1 < 0) - goto fail; - if (*p == '-' && p[1] != ']') { - const uint8_t *p0 = p + 1; - if (c1 >= CLASS_RANGE_BASE) { - if (s->is_utf16) { - cr_free(cr1); - goto invalid_class_range; - } - /* Annex B: match '-' character */ - goto class_atom; - } - c2 = get_class_atom(s, cr1, &p0, TRUE); - if ((int)c2 < 0) - goto fail; - if (c2 >= CLASS_RANGE_BASE) { - cr_free(cr1); - if (s->is_utf16) { - goto invalid_class_range; +static int re_parse_char_class(REParseState *s, const uint8_t **pp) +{ + const uint8_t *p; + uint32_t c1, c2; + CharRange cr_s, *cr = &cr_s; + CharRange cr1_s, *cr1 = &cr1_s; + BOOL invert; + + cr_init(cr, s->opaque, lre_realloc); + p 
= *pp; + p++; /* skip '[' */ + + invert = FALSE; + if (*p == '^') { + p++; + invert = TRUE; + } + + for(;;) { + if (*p == ']') + break; + c1 = get_class_atom(s, cr1, &p, TRUE); + if ((int)c1 < 0) + goto fail; + if (*p == '-' && p[1] != ']') { + const uint8_t *p0 = p + 1; + if (c1 >= CLASS_RANGE_BASE) { + if (s->is_unicode) { + cr_free(cr1); + goto invalid_class_range; + } + /* Annex B: match '-' character */ + goto class_atom; + } + c2 = get_class_atom(s, cr1, &p0, TRUE); + if ((int)c2 < 0) + goto fail; + if (c2 >= CLASS_RANGE_BASE) { + cr_free(cr1); + if (s->is_unicode) { + goto invalid_class_range; + } + /* Annex B: match '-' character */ + goto class_atom; + } + p = p0; + if (c2 < c1) { + invalid_class_range: + re_parse_error(s, "invalid class range"); + goto fail; + } + if (cr_union_interval(cr, c1, c2)) + goto memory_error; + } else { + class_atom: + if (c1 >= CLASS_RANGE_BASE) { + int ret; + ret = cr_union1(cr, cr1->points, cr1->len); + cr_free(cr1); + if (ret) + goto memory_error; + } else { + if (cr_union_interval(cr, c1, c1)) + goto memory_error; + } } - /* Annex B: match '-' character */ - goto class_atom; - } - p = p0; - if (c2 < c1) { - invalid_class_range: - re_parse_error(s, "invalid class range"); - goto fail; - } - if (cr_union_interval(cr, c1, c2)) - goto memory_error; - } else { - class_atom: - if (c1 >= CLASS_RANGE_BASE) { - int ret; - ret = cr_union1(cr, cr1->points, cr1->len); - cr_free(cr1); - if (ret) - goto memory_error; - } else { - if (cr_union_interval(cr, c1, c1)) - goto memory_error; - } } - } - if (s->ignore_case) { - if (cr_canonicalize(cr)) - goto memory_error; - } - if (invert) { - if (cr_invert(cr)) - goto memory_error; - } - if (re_emit_range(s, cr)) - goto fail; - cr_free(cr); - p++; /* skip ']' */ - *pp = p; - return 0; -memory_error: - re_parse_out_of_memory(s); -fail: - cr_free(cr); - return -1; + if (s->ignore_case) { + if (cr_regexp_canonicalize(cr, s->is_unicode)) + goto memory_error; + } + if (invert) { + if 
(cr_invert(cr)) + goto memory_error; + } + if (re_emit_range(s, cr)) + goto fail; + cr_free(cr); + p++; /* skip ']' */ + *pp = p; + return 0; + memory_error: + re_parse_out_of_memory(s); + fail: + cr_free(cr); + return -1; } /* Return: - 1 if the opcodes in bc_buf[] always advance the character pointer. - 0 if the character pointer may not be advanced. - -1 if the code may depend on side effects of its previous execution - (backreference) + - true if the opcodes may not advance the char pointer + - false if the opcodes always advance the char pointer */ -static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) { - int pos, opcode, ret, len, i; - uint32_t val, last; - BOOL has_back_reference; - uint8_t capture_bitmap[CAPTURE_COUNT_MAX]; - - ret = -2; /* not known yet */ - pos = 0; - has_back_reference = FALSE; - memset(capture_bitmap, 0, sizeof(capture_bitmap)); - - while (pos < bc_buf_len) { - opcode = bc_buf[pos]; - len = reopcode_info[opcode].size; - switch (opcode) { - case REOP_range: - val = get_u16(bc_buf + pos + 1); - len += val * 4; - goto simple_char; - case REOP_range32: - val = get_u16(bc_buf + pos + 1); - len += val * 8; - goto simple_char; - case REOP_char: - case REOP_char32: - case REOP_dot: - case REOP_any: - simple_char: - if (ret == -2) - ret = 1; - break; - case REOP_line_start: - case REOP_line_end: - case REOP_push_i32: - case REOP_push_char_pos: - case REOP_drop: - case REOP_word_boundary: - case REOP_not_word_boundary: - case REOP_prev: - /* no effect */ - break; - case REOP_save_start: - case REOP_save_end: - val = bc_buf[pos + 1]; - capture_bitmap[val] |= 1; - break; - case REOP_save_reset: { - val = bc_buf[pos + 1]; - last = bc_buf[pos + 2]; - while (val < last) - capture_bitmap[val++] |= 1; - } break; - case REOP_back_reference: - case REOP_backward_back_reference: - val = bc_buf[pos + 1]; - capture_bitmap[val] |= 2; - has_back_reference = TRUE; - break; - default: - /* safe behvior: we cannot predict the outcome */ - if (ret == 
-2) - ret = 0; - break; - } - pos += len; - } - if (has_back_reference) { - /* check if there is back reference which references a capture - made in the some code */ - for (i = 0; i < CAPTURE_COUNT_MAX; i++) { - if (capture_bitmap[i] == 3) - return -1; +static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len) +{ + int pos, opcode, len; + uint32_t val; + BOOL ret; + + ret = TRUE; + pos = 0; + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + switch(opcode) { + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + goto simple_char; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + goto simple_char; + case REOP_char: + case REOP_char32: + case REOP_dot: + case REOP_any: + simple_char: + ret = FALSE; + break; + case REOP_line_start: + case REOP_line_end: + case REOP_push_i32: + case REOP_push_char_pos: + case REOP_drop: + case REOP_word_boundary: + case REOP_not_word_boundary: + case REOP_prev: + /* no effect */ + break; + case REOP_save_start: + case REOP_save_end: + case REOP_save_reset: + case REOP_back_reference: + case REOP_backward_back_reference: + break; + default: + /* safe behavior: we cannot predict the outcome */ + return TRUE; + } + pos += len; } - } - if (ret == -2) - ret = 0; - return ret; + return ret; } /* return -1 if a simple quantifier cannot be used. Otherwise return the number of characters in the atom. 
*/ -static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) { - int pos, opcode, len, count; - uint32_t val; - - count = 0; - pos = 0; - while (pos < bc_buf_len) { - opcode = bc_buf[pos]; - len = reopcode_info[opcode].size; - switch (opcode) { - case REOP_range: - val = get_u16(bc_buf + pos + 1); - len += val * 4; - goto simple_char; - case REOP_range32: - val = get_u16(bc_buf + pos + 1); - len += val * 8; - goto simple_char; - case REOP_char: - case REOP_char32: - case REOP_dot: - case REOP_any: - simple_char: - count++; - break; - case REOP_line_start: - case REOP_line_end: - case REOP_word_boundary: - case REOP_not_word_boundary: - break; - default: - return -1; +static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) +{ + int pos, opcode, len, count; + uint32_t val; + + count = 0; + pos = 0; + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + switch(opcode) { + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + goto simple_char; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + goto simple_char; + case REOP_char: + case REOP_char32: + case REOP_dot: + case REOP_any: + simple_char: + count++; + break; + case REOP_line_start: + case REOP_line_end: + case REOP_word_boundary: + case REOP_not_word_boundary: + break; + default: + return -1; + } + pos += len; } - pos += len; - } - return count; + return count; } /* '*pp' is the first char after '<' */ -static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) { - const uint8_t *p, *p1; - uint32_t c, d; - char *q; - - p = *pp; - q = buf; - for (;;) { - c = *p; - if (c == '\\') { - p++; - if (*p != 'u') - return -1; - c = lre_parse_escape(&p, 2); // accept surrogate pairs - } else if (c == '>') { - break; - } else if (c >= 128) { - c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); - if (c >= 0xD800 && c <= 0xDBFF) { - d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); - if (d >= 
0xDC00 && d <= 0xDFFF) { - c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00); - p = p1; +static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp) +{ + const uint8_t *p, *p1; + uint32_t c, d; + char *q; + + p = *pp; + q = buf; + for(;;) { + c = *p; + if (c == '\\') { + p++; + if (*p != 'u') + return -1; + c = lre_parse_escape(&p, 2); // accept surrogate pairs + } else if (c == '>') { + break; + } else if (c >= 128) { + c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + if (is_hi_surrogate(c)) { + d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); + if (is_lo_surrogate(d)) { + c = from_surrogate(c, d); + p = p1; + } + } + } else { + p++; + } + if (c > 0x10FFFF) + return -1; + if (q == buf) { + if (!lre_js_is_ident_first(c)) + return -1; + } else { + if (!lre_js_is_ident_next(c)) + return -1; + } + if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) + return -1; + if (c < 128) { + *q++ = c; + } else { + q += unicode_to_utf8((uint8_t*)q, c); } - } - } else { - p++; } - if (c > 0x10FFFF) - return -1; - if (q == buf) { - if (!lre_js_is_ident_first(c)) - return -1; - } else { - if (!lre_js_is_ident_next(c)) + if (q == buf) return -1; - } - if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) - return -1; - if (c < 128) { - *q++ = c; - } else { - q += unicode_to_utf8((uint8_t *)q, c); - } - } - if (q == buf) - return -1; - *q = '\0'; - p++; - *pp = p; - return 0; + *q = '\0'; + p++; + *pp = p; + return 0; } /* if capture_name = NULL: return the number of captures + 1. 
Otherwise, return the capture index corresponding to capture_name or -1 if none */ static int re_parse_captures(REParseState *s, int *phas_named_captures, - const char *capture_name) { - const uint8_t *p; - int capture_index; - char name[TMP_BUF_SIZE]; - - capture_index = 1; - *phas_named_captures = 0; - for (p = s->buf_start; p < s->buf_end; p++) { - switch (*p) { - case '(': - if (p[1] == '?') { - if (p[2] == '<' && p[3] != '=' && p[3] != '!') { - *phas_named_captures = 1; - /* potential named capture */ - if (capture_name) { - p += 3; - if (re_parse_group_name(name, sizeof(name), &p) == 0) { - if (!strcmp(name, capture_name)) - return capture_index; + const char *capture_name) +{ + const uint8_t *p; + int capture_index; + char name[TMP_BUF_SIZE]; + + capture_index = 1; + *phas_named_captures = 0; + for (p = s->buf_start; p < s->buf_end; p++) { + switch (*p) { + case '(': + if (p[1] == '?') { + if (p[2] == '<' && p[3] != '=' && p[3] != '!') { + *phas_named_captures = 1; + /* potential named capture */ + if (capture_name) { + p += 3; + if (re_parse_group_name(name, sizeof(name), &p) == 0) { + if (!strcmp(name, capture_name)) + return capture_index; + } + } + capture_index++; + if (capture_index >= CAPTURE_COUNT_MAX) + goto done; + } + } else { + capture_index++; + if (capture_index >= CAPTURE_COUNT_MAX) + goto done; + } + break; + case '\\': + p++; + break; + case '[': + for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) { + if (*p == '\\') + p++; } - } - capture_index++; - if (capture_index >= CAPTURE_COUNT_MAX) - goto done; + break; } - } else { - capture_index++; - if (capture_index >= CAPTURE_COUNT_MAX) - goto done; - } - break; - case '\\': - p++; - break; - case '[': - for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) { - if (*p == '\\') - p++; - } - break; } - } -done: - if (capture_name) - return -1; - else - return capture_index; + done: + if (capture_name) + return -1; + else + return capture_index; } -static int 
re_count_captures(REParseState *s) { - if (s->total_capture_count < 0) { - s->total_capture_count = re_parse_captures(s, &s->has_named_captures, NULL); - } - return s->total_capture_count; +static int re_count_captures(REParseState *s) +{ + if (s->total_capture_count < 0) { + s->total_capture_count = re_parse_captures(s, &s->has_named_captures, + NULL); + } + return s->total_capture_count; } -static BOOL re_has_named_captures(REParseState *s) { - if (s->has_named_captures < 0) - re_count_captures(s); - return s->has_named_captures; +static BOOL re_has_named_captures(REParseState *s) +{ + if (s->has_named_captures < 0) + re_count_captures(s); + return s->has_named_captures; } -static int find_group_name(REParseState *s, const char *name) { - const char *p, *buf_end; - size_t len, name_len; - int capture_index; - - name_len = strlen(name); - p = (char *)s->group_names.buf; - buf_end = (char *)s->group_names.buf + s->group_names.size; - capture_index = 1; - while (p < buf_end) { - len = strlen(p); - if (len == name_len && memcmp(name, p, name_len) == 0) - return capture_index; - p += len + 1; - capture_index++; - } - return -1; +static int find_group_name(REParseState *s, const char *name) +{ + const char *p, *buf_end; + size_t len, name_len; + int capture_index; + + p = (char *)s->group_names.buf; + if (!p) return -1; + buf_end = (char *)s->group_names.buf + s->group_names.size; + name_len = strlen(name); + capture_index = 1; + while (p < buf_end) { + len = strlen(p); + if (len == name_len && memcmp(name, p, name_len) == 0) + return capture_index; + p += len + 1; + capture_index++; + } + return -1; } static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir); -static int re_parse_term(REParseState *s, BOOL is_backward_dir) { - const uint8_t *p; - int c, last_atom_start, quant_min, quant_max, last_capture_count; - BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead; - CharRange cr_s, *cr = &cr_s; - - last_atom_start = -1; - 
last_capture_count = 0; - p = s->buf_ptr; - c = *p; - switch (c) { - case '^': - p++; - re_emit_op(s, REOP_line_start); - break; - case '$': - p++; - re_emit_op(s, REOP_line_end); - break; - case '.': - p++; - last_atom_start = s->byte_code.size; - last_capture_count = s->capture_count; - if (is_backward_dir) - re_emit_op(s, REOP_prev); - re_emit_op(s, s->dotall ? REOP_any : REOP_dot); - if (is_backward_dir) - re_emit_op(s, REOP_prev); - break; - case '{': - if (s->is_utf16) { - return re_parse_error(s, "syntax error"); - } else if (!is_digit(p[1])) { - /* Annex B: we accept '{' not followed by digits as a - normal atom */ - goto parse_class_atom; - } else { - const uint8_t *p1 = p + 1; - /* Annex B: error if it is like a repetition count */ - parse_digits(&p1, TRUE); - if (*p1 == ',') { - p1++; - if (is_digit(*p1)) { - parse_digits(&p1, TRUE); - } - } - if (*p1 != '}') { - goto parse_class_atom; - } - } - /* fall thru */ - case '*': - case '+': - case '?': - return re_parse_error(s, "nothing to repeat"); - case '(': - if (p[1] == '?') { - if (p[2] == ':') { - p += 3; +static int re_parse_term(REParseState *s, BOOL is_backward_dir) +{ + const uint8_t *p; + int c, last_atom_start, quant_min, quant_max, last_capture_count; + BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead; + CharRange cr_s, *cr = &cr_s; + + last_atom_start = -1; + last_capture_count = 0; + p = s->buf_ptr; + c = *p; + switch(c) { + case '^': + p++; + re_emit_op(s, REOP_line_start); + break; + case '$': + p++; + re_emit_op(s, REOP_line_end); + break; + case '.': + p++; last_atom_start = s->byte_code.size; last_capture_count = s->capture_count; - s->buf_ptr = p; - if (re_parse_disjunction(s, is_backward_dir)) - return -1; - p = s->buf_ptr; - if (re_parse_expect(s, &p, ')')) - return -1; - } else if ((p[2] == '=' || p[2] == '!')) { - is_neg = (p[2] == '!'); - is_backward_lookahead = FALSE; - p += 3; - goto lookahead; - } else if (p[2] == '<' && (p[3] == '=' || p[3] == '!')) { - int 
pos; - is_neg = (p[3] == '!'); - is_backward_lookahead = TRUE; - p += 4; - /* lookahead */ - lookahead: - /* Annex B allows lookahead to be used as an atom for - the quantifiers */ - if (!s->is_utf16 && !is_backward_lookahead) { - last_atom_start = s->byte_code.size; - last_capture_count = s->capture_count; - } - pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0); - s->buf_ptr = p; - if (re_parse_disjunction(s, is_backward_lookahead)) - return -1; - p = s->buf_ptr; - if (re_parse_expect(s, &p, ')')) - return -1; - re_emit_op(s, REOP_match); - /* jump after the 'match' after the lookahead is successful */ - if (dbuf_error(&s->byte_code)) - return -1; - put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4)); - } else if (p[2] == '<') { - p += 3; - if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), &p)) { - return re_parse_error(s, "invalid group name"); - } - if (find_group_name(s, s->u.tmp_buf) > 0) { - return re_parse_error(s, "duplicate group name"); - } - /* group name with a trailing zero */ - dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf, - strlen(s->u.tmp_buf) + 1); - s->has_named_captures = 1; - goto parse_capture; - } else { - return re_parse_error(s, "invalid group"); - } - } else { - int capture_index; - p++; - /* capture without group name */ - dbuf_putc(&s->group_names, 0); - parse_capture: - if (s->capture_count >= CAPTURE_COUNT_MAX) - return re_parse_error(s, "too many captures"); - last_atom_start = s->byte_code.size; - last_capture_count = s->capture_count; - capture_index = s->capture_count++; - re_emit_op_u8(s, REOP_save_start + is_backward_dir, capture_index); - - s->buf_ptr = p; - if (re_parse_disjunction(s, is_backward_dir)) - return -1; - p = s->buf_ptr; - - re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir, capture_index); - - if (re_parse_expect(s, &p, ')')) - return -1; - } - break; - case '\\': - switch (p[1]) { - case 'b': - case 'B': - re_emit_op(s, REOP_word_boundary + (p[1] != 'b')); - p += 2; - break; - case 
'k': { - const uint8_t *p1; - int dummy_res; - - p1 = p; - if (p1[2] != '<') { - /* annex B: we tolerate invalid group names in non - unicode mode if there is no named capture - definition */ - if (s->is_utf16 || re_has_named_captures(s)) - return re_parse_error(s, "expecting group name"); - else - goto parse_class_atom; - } - p1 += 3; - if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), &p1)) { - if (s->is_utf16 || re_has_named_captures(s)) - return re_parse_error(s, "invalid group name"); - else - goto parse_class_atom; - } - c = find_group_name(s, s->u.tmp_buf); - if (c < 0) { - /* no capture name parsed before, try to look - after (inefficient, but hopefully not common */ - c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); - if (c < 0) { - if (s->is_utf16 || re_has_named_captures(s)) - return re_parse_error(s, "group name not defined"); - else + if (is_backward_dir) + re_emit_op(s, REOP_prev); + re_emit_op(s, s->dotall ? REOP_any : REOP_dot); + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; + case '{': + if (s->is_unicode) { + return re_parse_error(s, "syntax error"); + } else if (!is_digit(p[1])) { + /* Annex B: we accept '{' not followed by digits as a + normal atom */ goto parse_class_atom; - } - } - p = p1; - } - goto emit_back_reference; - case '0': - p += 2; - c = 0; - if (s->is_utf16) { - if (is_digit(*p)) { - return re_parse_error(s, - "invalid decimal escape in regular expression"); - } - } else { - /* Annex B.1.4: accept legacy octal */ - if (*p >= '0' && *p <= '7') { - c = *p++ - '0'; - if (*p >= '0' && *p <= '7') { - c = (c << 3) + *p++ - '0'; - } - } - } - goto normal_char; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - const uint8_t *q = ++p; - - c = parse_digits(&p, FALSE); - if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { - if (!s->is_utf16) { - /* Annex B.1.4: accept legacy octal */ - p = q; - if (*p <= '7') { - c = 0; - if (*p <= '3') - 
c = *p++ - '0'; - if (*p >= '0' && *p <= '7') { - c = (c << 3) + *p++ - '0'; - if (*p >= '0' && *p <= '7') { - c = (c << 3) + *p++ - '0'; - } + } else { + const uint8_t *p1 = p + 1; + /* Annex B: error if it is like a repetition count */ + parse_digits(&p1, TRUE); + if (*p1 == ',') { + p1++; + if (is_digit(*p1)) { + parse_digits(&p1, TRUE); + } + } + if (*p1 != '}') { + goto parse_class_atom; } - } else { - c = *p++; - } - goto normal_char; } - return re_parse_error( - s, "back reference out of range in regular expression"); - } - emit_back_reference: - last_atom_start = s->byte_code.size; - last_capture_count = s->capture_count; - re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c); - } break; - default: - goto parse_class_atom; - } - break; - case '[': - last_atom_start = s->byte_code.size; - last_capture_count = s->capture_count; - if (is_backward_dir) - re_emit_op(s, REOP_prev); - if (re_parse_char_class(s, &p)) - return -1; - if (is_backward_dir) - re_emit_op(s, REOP_prev); - break; - case ']': - case '}': - if (s->is_utf16) - return re_parse_error(s, "syntax error"); - goto parse_class_atom; - default: - parse_class_atom: - c = get_class_atom(s, cr, &p, FALSE); - if ((int)c < 0) - return -1; - normal_char: - last_atom_start = s->byte_code.size; - last_capture_count = s->capture_count; - if (is_backward_dir) - re_emit_op(s, REOP_prev); - if (c >= CLASS_RANGE_BASE) { - int ret; - /* Note: canonicalization is not needed */ - ret = re_emit_range(s, cr); - cr_free(cr); - if (ret) - return -1; - } else { - if (s->ignore_case) - c = lre_canonicalize(c, s->is_utf16); - if (c <= 0xffff) - re_emit_op_u16(s, REOP_char, c); - else - re_emit_op_u32(s, REOP_char32, c); - } - if (is_backward_dir) - re_emit_op(s, REOP_prev); - break; - } - - /* quantifier */ - if (last_atom_start >= 0) { - c = *p; - switch (c) { + /* fall thru */ case '*': - p++; - quant_min = 0; - quant_max = INT32_MAX; - goto quantifier; case '+': - p++; - quant_min = 1; - quant_max = INT32_MAX; - 
goto quantifier; case '?': - p++; - quant_min = 0; - quant_max = 1; - goto quantifier; - case '{': { - const uint8_t *p1 = p; - /* As an extension (see ES6 annex B), we accept '{' not - followed by digits as a normal atom */ - if (!is_digit(p[1])) { - if (s->is_utf16) - goto invalid_quant_count; - break; - } - p++; - quant_min = parse_digits(&p, TRUE); - quant_max = quant_min; - if (*p == ',') { - p++; - if (is_digit(*p)) { - quant_max = parse_digits(&p, TRUE); - if (quant_max < quant_min) { - invalid_quant_count: - return re_parse_error(s, "invalid repetition count"); - } + return re_parse_error(s, "nothing to repeat"); + case '(': + if (p[1] == '?') { + if (p[2] == ':') { + p += 3; + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_dir)) + return -1; + p = s->buf_ptr; + if (re_parse_expect(s, &p, ')')) + return -1; + } else if ((p[2] == '=' || p[2] == '!')) { + is_neg = (p[2] == '!'); + is_backward_lookahead = FALSE; + p += 3; + goto lookahead; + } else if (p[2] == '<' && + (p[3] == '=' || p[3] == '!')) { + int pos; + is_neg = (p[3] == '!'); + is_backward_lookahead = TRUE; + p += 4; + /* lookahead */ + lookahead: + /* Annex B allows lookahead to be used as an atom for + the quantifiers */ + if (!s->is_unicode && !is_backward_lookahead) { + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + } + pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0); + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_lookahead)) + return -1; + p = s->buf_ptr; + if (re_parse_expect(s, &p, ')')) + return -1; + re_emit_op(s, REOP_match); + /* jump after the 'match' after the lookahead is successful */ + if (dbuf_error(&s->byte_code)) + return -1; + put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4)); + } else if (p[2] == '<') { + p += 3; + if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), + &p)) { + return re_parse_error(s, "invalid group 
name"); + } + if (find_group_name(s, s->u.tmp_buf) > 0) { + return re_parse_error(s, "duplicate group name"); + } + /* group name with a trailing zero */ + dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf, + strlen(s->u.tmp_buf) + 1); + s->has_named_captures = 1; + goto parse_capture; + } else { + return re_parse_error(s, "invalid group"); + } } else { - quant_max = INT32_MAX; /* infinity */ + int capture_index; + p++; + /* capture without group name */ + dbuf_putc(&s->group_names, 0); + parse_capture: + if (s->capture_count >= CAPTURE_COUNT_MAX) + return re_parse_error(s, "too many captures"); + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + capture_index = s->capture_count++; + re_emit_op_u8(s, REOP_save_start + is_backward_dir, + capture_index); + + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_dir)) + return -1; + p = s->buf_ptr; + + re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir, + capture_index); + + if (re_parse_expect(s, &p, ')')) + return -1; } - } - if (*p != '}' && !s->is_utf16) { - /* Annex B: normal atom if invalid '{' syntax */ - p = p1; break; - } - if (re_parse_expect(s, &p, '}')) - return -1; - } - quantifier: - greedy = TRUE; - if (*p == '?') { - p++; - greedy = FALSE; - } - if (last_atom_start < 0) { - return re_parse_error(s, "nothing to repeat"); - } - if (greedy) { - int len, pos; - - if (quant_max > 0) { - /* specific optimization for simple quantifiers */ - if (dbuf_error(&s->byte_code)) - goto out_of_memory; - len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start, - s->byte_code.size - last_atom_start); - if (len > 0) { - re_emit_op(s, REOP_match); - - if (dbuf_insert(&s->byte_code, last_atom_start, 17)) - goto out_of_memory; - pos = last_atom_start; - s->byte_code.buf[pos++] = REOP_simple_greedy_quant; - put_u32(&s->byte_code.buf[pos], - s->byte_code.size - last_atom_start - 17); - pos += 4; - put_u32(&s->byte_code.buf[pos], quant_min); - pos += 4; - 
put_u32(&s->byte_code.buf[pos], quant_max); - pos += 4; - put_u32(&s->byte_code.buf[pos], len); - pos += 4; - goto done; - } - } - - if (dbuf_error(&s->byte_code)) - goto out_of_memory; - add_zero_advance_check = - (re_check_advance(s->byte_code.buf + last_atom_start, - s->byte_code.size - last_atom_start) == 0); - } else { - add_zero_advance_check = FALSE; - } - - { - int len, pos; - len = s->byte_code.size - last_atom_start; - if (quant_min == 0) { - /* need to reset the capture in case the atom is - not executed */ - if (last_capture_count != s->capture_count) { - if (dbuf_insert(&s->byte_code, last_atom_start, 3)) - goto out_of_memory; - s->byte_code.buf[last_atom_start++] = REOP_save_reset; - s->byte_code.buf[last_atom_start++] = last_capture_count; - s->byte_code.buf[last_atom_start++] = s->capture_count - 1; - } - if (quant_max == 0) { - s->byte_code.size = last_atom_start; - } else if (quant_max == 1) { - if (dbuf_insert(&s->byte_code, last_atom_start, 5)) - goto out_of_memory; - s->byte_code.buf[last_atom_start] = REOP_split_goto_first + greedy; - put_u32(s->byte_code.buf + last_atom_start + 1, len); - } else if (quant_max == INT32_MAX) { - if (dbuf_insert(&s->byte_code, last_atom_start, - 5 + add_zero_advance_check)) - goto out_of_memory; - s->byte_code.buf[last_atom_start] = REOP_split_goto_first + greedy; - put_u32(s->byte_code.buf + last_atom_start + 1, - len + 5 + add_zero_advance_check); - if (add_zero_advance_check) { - /* avoid infinite loop by stoping the - recursion if no advance was made in the - atom (only works if the atom has no - side effect) */ - s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; - re_emit_goto(s, REOP_bne_char_pos, last_atom_start); + case '\\': + switch(p[1]) { + case 'b': + case 'B': + re_emit_op(s, REOP_word_boundary + (p[1] != 'b')); + p += 2; + break; + case 'k': + { + const uint8_t *p1; + int dummy_res; + + p1 = p; + if (p1[2] != '<') { + /* annex B: we tolerate invalid group names in non + unicode mode 
if there is no named capture + definition */ + if (s->is_unicode || re_has_named_captures(s)) + return re_parse_error(s, "expecting group name"); + else + goto parse_class_atom; + } + p1 += 3; + if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), + &p1)) { + if (s->is_unicode || re_has_named_captures(s)) + return re_parse_error(s, "invalid group name"); + else + goto parse_class_atom; + } + c = find_group_name(s, s->u.tmp_buf); + if (c < 0) { + /* no capture name parsed before, try to look + after (inefficient, but hopefully not common */ + c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); + if (c < 0) { + if (s->is_unicode || re_has_named_captures(s)) + return re_parse_error(s, "group name not defined"); + else + goto parse_class_atom; + } + } + p = p1; + } + goto emit_back_reference; + case '0': + p += 2; + c = 0; + if (s->is_unicode) { + if (is_digit(*p)) { + return re_parse_error(s, "invalid decimal escape in regular expression"); + } } else { - re_emit_goto(s, REOP_goto, last_atom_start); + /* Annex B.1.4: accept legacy octal */ + if (*p >= '0' && *p <= '7') { + c = *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + } + } + } + goto normal_char; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': + case '9': + { + const uint8_t *q = ++p; + + c = parse_digits(&p, FALSE); + if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { + if (!s->is_unicode) { + /* Annex B.1.4: accept legacy octal */ + p = q; + if (*p <= '7') { + c = 0; + if (*p <= '3') + c = *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + } + } + } else { + c = *p++; + } + goto normal_char; + } + return re_parse_error(s, "back reference out of range in regular expression"); + } + emit_back_reference: + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + re_emit_op_u8(s, REOP_back_reference + is_backward_dir, 
c); } - } else { - if (dbuf_insert(&s->byte_code, last_atom_start, 10)) - goto out_of_memory; - pos = last_atom_start; - s->byte_code.buf[pos++] = REOP_push_i32; - put_u32(s->byte_code.buf + pos, quant_max); - pos += 4; - s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; - put_u32(s->byte_code.buf + pos, len + 5); - re_emit_goto(s, REOP_loop, last_atom_start + 5); - re_emit_op(s, REOP_drop); - } - } else if (quant_min == 1 && quant_max == INT32_MAX && - !add_zero_advance_check) { - re_emit_goto(s, REOP_split_next_first - greedy, last_atom_start); + break; + default: + goto parse_class_atom; + } + break; + case '[': + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + if (re_parse_char_class(s, &p)) + return -1; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; + case ']': + case '}': + if (s->is_unicode) + return re_parse_error(s, "syntax error"); + goto parse_class_atom; + default: + parse_class_atom: + c = get_class_atom(s, cr, &p, FALSE); + if ((int)c < 0) + return -1; + normal_char: + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + if (c >= CLASS_RANGE_BASE) { + int ret; + /* Note: canonicalization is not needed */ + ret = re_emit_range(s, cr); + cr_free(cr); + if (ret) + return -1; } else { - if (quant_min == 1) { - /* nothing to add */ - } else { - if (dbuf_insert(&s->byte_code, last_atom_start, 5)) - goto out_of_memory; - s->byte_code.buf[last_atom_start] = REOP_push_i32; - put_u32(s->byte_code.buf + last_atom_start + 1, quant_min); - last_atom_start += 5; - re_emit_goto(s, REOP_loop, last_atom_start); - re_emit_op(s, REOP_drop); - } - if (quant_max == INT32_MAX) { - pos = s->byte_code.size; - re_emit_op_u32(s, REOP_split_goto_first + greedy, - len + 5 + add_zero_advance_check); - if (add_zero_advance_check) - re_emit_op(s, REOP_push_char_pos); - /* copy the atom */ - 
dbuf_put_self(&s->byte_code, last_atom_start, len); - if (add_zero_advance_check) - re_emit_goto(s, REOP_bne_char_pos, pos); + if (s->ignore_case) + c = lre_canonicalize(c, s->is_unicode); + if (c <= 0xffff) + re_emit_op_u16(s, REOP_char, c); else - re_emit_goto(s, REOP_goto, pos); - } else if (quant_max > quant_min) { - re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); - pos = s->byte_code.size; - re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5); - /* copy the atom */ - dbuf_put_self(&s->byte_code, last_atom_start, len); - - re_emit_goto(s, REOP_loop, pos); - re_emit_op(s, REOP_drop); - } + re_emit_op_u32(s, REOP_char32, c); } - last_atom_start = -1; - } - break; - default: - break; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; } - } -done: - s->buf_ptr = p; - return 0; -out_of_memory: - return re_parse_out_of_memory(s); + + /* quantifier */ + if (last_atom_start >= 0) { + c = *p; + switch(c) { + case '*': + p++; + quant_min = 0; + quant_max = INT32_MAX; + goto quantifier; + case '+': + p++; + quant_min = 1; + quant_max = INT32_MAX; + goto quantifier; + case '?': + p++; + quant_min = 0; + quant_max = 1; + goto quantifier; + case '{': + { + const uint8_t *p1 = p; + /* As an extension (see ES6 annex B), we accept '{' not + followed by digits as a normal atom */ + if (!is_digit(p[1])) { + if (s->is_unicode) + goto invalid_quant_count; + break; + } + p++; + quant_min = parse_digits(&p, TRUE); + quant_max = quant_min; + if (*p == ',') { + p++; + if (is_digit(*p)) { + quant_max = parse_digits(&p, TRUE); + if (quant_max < quant_min) { + invalid_quant_count: + return re_parse_error(s, "invalid repetition count"); + } + } else { + quant_max = INT32_MAX; /* infinity */ + } + } + if (*p != '}' && !s->is_unicode) { + /* Annex B: normal atom if invalid '{' syntax */ + p = p1; + break; + } + if (re_parse_expect(s, &p, '}')) + return -1; + } + quantifier: + greedy = TRUE; + if (*p == '?') { + p++; + greedy = FALSE; + } + if (last_atom_start < 0) { 
+ return re_parse_error(s, "nothing to repeat"); + } + if (greedy) { + int len, pos; + + if (quant_max > 0) { + /* specific optimization for simple quantifiers */ + if (dbuf_error(&s->byte_code)) + goto out_of_memory; + len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start, + s->byte_code.size - last_atom_start); + if (len > 0) { + re_emit_op(s, REOP_match); + + if (dbuf_insert(&s->byte_code, last_atom_start, 17)) + goto out_of_memory; + pos = last_atom_start; + s->byte_code.buf[pos++] = REOP_simple_greedy_quant; + put_u32(&s->byte_code.buf[pos], + s->byte_code.size - last_atom_start - 17); + pos += 4; + put_u32(&s->byte_code.buf[pos], quant_min); + pos += 4; + put_u32(&s->byte_code.buf[pos], quant_max); + pos += 4; + put_u32(&s->byte_code.buf[pos], len); + pos += 4; + goto done; + } + } + + if (dbuf_error(&s->byte_code)) + goto out_of_memory; + } + /* the spec tells that if there is no advance when + running the atom after the first quant_min times, + then there is no match. We remove this test when we + are sure the atom always advances the position. 
*/ + add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start, + s->byte_code.size - last_atom_start); + + { + int len, pos; + len = s->byte_code.size - last_atom_start; + if (quant_min == 0) { + /* need to reset the capture in case the atom is + not executed */ + if (last_capture_count != s->capture_count) { + if (dbuf_insert(&s->byte_code, last_atom_start, 3)) + goto out_of_memory; + s->byte_code.buf[last_atom_start++] = REOP_save_reset; + s->byte_code.buf[last_atom_start++] = last_capture_count; + s->byte_code.buf[last_atom_start++] = s->capture_count - 1; + } + if (quant_max == 0) { + s->byte_code.size = last_atom_start; + } else if (quant_max == 1 || quant_max == INT32_MAX) { + BOOL has_goto = (quant_max == INT32_MAX); + if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check)) + goto out_of_memory; + s->byte_code.buf[last_atom_start] = REOP_split_goto_first + + greedy; + put_u32(s->byte_code.buf + last_atom_start + 1, + len + 5 * has_goto + add_zero_advance_check * 2); + if (add_zero_advance_check) { + s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; + re_emit_op(s, REOP_check_advance); + } + if (has_goto) + re_emit_goto(s, REOP_goto, last_atom_start); + } else { + if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check)) + goto out_of_memory; + pos = last_atom_start; + s->byte_code.buf[pos++] = REOP_push_i32; + put_u32(s->byte_code.buf + pos, quant_max); + pos += 4; + s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; + put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2); + pos += 4; + if (add_zero_advance_check) { + s->byte_code.buf[pos++] = REOP_push_char_pos; + re_emit_op(s, REOP_check_advance); + } + re_emit_goto(s, REOP_loop, last_atom_start + 5); + re_emit_op(s, REOP_drop); + } + } else if (quant_min == 1 && quant_max == INT32_MAX && + !add_zero_advance_check) { + re_emit_goto(s, REOP_split_next_first - greedy, + last_atom_start); + } else { + if 
(quant_min == 1) { + /* nothing to add */ + } else { + if (dbuf_insert(&s->byte_code, last_atom_start, 5)) + goto out_of_memory; + s->byte_code.buf[last_atom_start] = REOP_push_i32; + put_u32(s->byte_code.buf + last_atom_start + 1, + quant_min); + last_atom_start += 5; + re_emit_goto(s, REOP_loop, last_atom_start); + re_emit_op(s, REOP_drop); + } + if (quant_max == INT32_MAX) { + pos = s->byte_code.size; + re_emit_op_u32(s, REOP_split_goto_first + greedy, + len + 5 + add_zero_advance_check * 2); + if (add_zero_advance_check) + re_emit_op(s, REOP_push_char_pos); + /* copy the atom */ + dbuf_put_self(&s->byte_code, last_atom_start, len); + if (add_zero_advance_check) + re_emit_op(s, REOP_check_advance); + re_emit_goto(s, REOP_goto, pos); + } else if (quant_max > quant_min) { + re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); + pos = s->byte_code.size; + re_emit_op_u32(s, REOP_split_goto_first + greedy, + len + 5 + add_zero_advance_check * 2); + if (add_zero_advance_check) + re_emit_op(s, REOP_push_char_pos); + /* copy the atom */ + dbuf_put_self(&s->byte_code, last_atom_start, len); + if (add_zero_advance_check) + re_emit_op(s, REOP_check_advance); + re_emit_goto(s, REOP_loop, pos); + re_emit_op(s, REOP_drop); + } + } + last_atom_start = -1; + } + break; + default: + break; + } + } + done: + s->buf_ptr = p; + return 0; + out_of_memory: + return re_parse_out_of_memory(s); } -static int re_parse_alternative(REParseState *s, BOOL is_backward_dir) { - const uint8_t *p; - int ret; - size_t start, term_start, end, term_size; +static int re_parse_alternative(REParseState *s, BOOL is_backward_dir) +{ + const uint8_t *p; + int ret; + size_t start, term_start, end, term_size; - start = s->byte_code.size; - for (;;) { - p = s->buf_ptr; - if (p >= s->buf_end) - break; - if (*p == '|' || *p == ')') - break; - term_start = s->byte_code.size; - ret = re_parse_term(s, is_backward_dir); - if (ret) - return ret; - if (is_backward_dir) { - /* reverse the order of the terms 
(XXX: inefficient, but - speed is not really critical here) */ - end = s->byte_code.size; - term_size = end - term_start; - if (dbuf_realloc(&s->byte_code, end + term_size)) - return -1; - memmove(s->byte_code.buf + start + term_size, s->byte_code.buf + start, - end - start); - memcpy(s->byte_code.buf + start, s->byte_code.buf + end, term_size); + start = s->byte_code.size; + for(;;) { + p = s->buf_ptr; + if (p >= s->buf_end) + break; + if (*p == '|' || *p == ')') + break; + term_start = s->byte_code.size; + ret = re_parse_term(s, is_backward_dir); + if (ret) + return ret; + if (is_backward_dir) { + /* reverse the order of the terms (XXX: inefficient, but + speed is not really critical here) */ + end = s->byte_code.size; + term_size = end - term_start; + if (dbuf_realloc(&s->byte_code, end + term_size)) + return -1; + memmove(s->byte_code.buf + start + term_size, + s->byte_code.buf + start, + end - start); + memcpy(s->byte_code.buf + start, s->byte_code.buf + end, + term_size); + } } - } - return 0; + return 0; } -static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir) { - int start, len, pos; +static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir) +{ + int start, len, pos; - if (lre_check_stack_overflow(s->opaque, 0)) - return re_parse_error(s, "stack overflow"); + if (lre_check_stack_overflow(s->opaque, 0)) + return re_parse_error(s, "stack overflow"); - start = s->byte_code.size; - if (re_parse_alternative(s, is_backward_dir)) - return -1; - while (*s->buf_ptr == '|') { - s->buf_ptr++; + start = s->byte_code.size; + if (re_parse_alternative(s, is_backward_dir)) + return -1; + while (*s->buf_ptr == '|') { + s->buf_ptr++; - len = s->byte_code.size - start; + len = s->byte_code.size - start; - /* insert a split before the first alternative */ - if (dbuf_insert(&s->byte_code, start, 5)) { - return re_parse_out_of_memory(s); - } - s->byte_code.buf[start] = REOP_split_next_first; - put_u32(s->byte_code.buf + start + 1, len + 5); + /* 
insert a split before the first alternative */ + if (dbuf_insert(&s->byte_code, start, 5)) { + return re_parse_out_of_memory(s); + } + s->byte_code.buf[start] = REOP_split_next_first; + put_u32(s->byte_code.buf + start + 1, len + 5); - pos = re_emit_op_u32(s, REOP_goto, 0); + pos = re_emit_op_u32(s, REOP_goto, 0); - if (re_parse_alternative(s, is_backward_dir)) - return -1; + if (re_parse_alternative(s, is_backward_dir)) + return -1; - /* patch the goto */ - len = s->byte_code.size - (pos + 4); - put_u32(s->byte_code.buf + pos, len); - } - return 0; + /* patch the goto */ + len = s->byte_code.size - (pos + 4); + put_u32(s->byte_code.buf + pos, len); + } + return 0; } /* the control flow is recursive so the analysis can be linear */ -static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) { - int stack_size, stack_size_max, pos, opcode, len; - uint32_t val; - - stack_size = 0; - stack_size_max = 0; - bc_buf += RE_HEADER_LEN; - bc_buf_len -= RE_HEADER_LEN; - pos = 0; - while (pos < bc_buf_len) { - opcode = bc_buf[pos]; - len = reopcode_info[opcode].size; - assert(opcode < REOP_COUNT); - assert((pos + len) <= bc_buf_len); - switch (opcode) { - case REOP_push_i32: - case REOP_push_char_pos: - stack_size++; - if (stack_size > stack_size_max) { - if (stack_size > STACK_SIZE_MAX) - return -1; - stack_size_max = stack_size; - } - break; - case REOP_drop: - case REOP_bne_char_pos: - assert(stack_size > 0); - stack_size--; - break; - case REOP_range: - val = get_u16(bc_buf + pos + 1); - len += val * 4; - break; - case REOP_range32: - val = get_u16(bc_buf + pos + 1); - len += val * 8; - break; +static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) +{ + int stack_size, stack_size_max, pos, opcode, len; + uint32_t val; + + stack_size = 0; + stack_size_max = 0; + bc_buf += RE_HEADER_LEN; + bc_buf_len -= RE_HEADER_LEN; + pos = 0; + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + assert(opcode < REOP_COUNT); + 
assert((pos + len) <= bc_buf_len); + switch(opcode) { + case REOP_push_i32: + case REOP_push_char_pos: + stack_size++; + if (stack_size > stack_size_max) { + if (stack_size > STACK_SIZE_MAX) + return -1; + stack_size_max = stack_size; + } + break; + case REOP_drop: + case REOP_check_advance: + assert(stack_size > 0); + stack_size--; + break; + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + break; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + break; + } + pos += len; } - pos += len; - } - return stack_size_max; + return stack_size_max; } /* 'buf' must be a zero terminated UTF-8 string of length buf_len. @@ -1785,750 +1718,784 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) { */ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, const char *buf, size_t buf_len, int re_flags, - void *opaque) { - REParseState s_s, *s = &s_s; - int stack_size; - BOOL is_sticky; - - memset(s, 0, sizeof(*s)); - s->opaque = opaque; - s->buf_ptr = (const uint8_t *)buf; - s->buf_end = s->buf_ptr + buf_len; - s->buf_start = s->buf_ptr; - s->re_flags = re_flags; - s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0); - is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); - s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); - s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); - s->capture_count = 1; - s->total_capture_count = -1; - s->has_named_captures = -1; - - dbuf_init2(&s->byte_code, opaque, lre_realloc); - dbuf_init2(&s->group_names, opaque, lre_realloc); - - dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */ - dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */ - dbuf_putc(&s->byte_code, 0); /* stack size */ - dbuf_put_u32(&s->byte_code, 0); /* bytecode length */ - - if (!is_sticky) { - /* iterate thru all positions (about the same as .*?( ... ) ) - . 
We do it without an explicit loop so that lock step - thread execution will be possible in an optimized - implementation */ - re_emit_op_u32(s, REOP_split_goto_first, 1 + 5); - re_emit_op(s, REOP_any); - re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5)); - } - re_emit_op_u8(s, REOP_save_start, 0); - - if (re_parse_disjunction(s, FALSE)) { - error: - dbuf_free(&s->byte_code); - dbuf_free(&s->group_names); - pstrcpy(error_msg, error_msg_size, s->u.error_msg); - *plen = 0; - return NULL; - } + void *opaque) +{ + REParseState s_s, *s = &s_s; + int stack_size; + BOOL is_sticky; + + memset(s, 0, sizeof(*s)); + s->opaque = opaque; + s->buf_ptr = (const uint8_t *)buf; + s->buf_end = s->buf_ptr + buf_len; + s->buf_start = s->buf_ptr; + s->re_flags = re_flags; + s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0); + is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); + s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); + s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); + s->capture_count = 1; + s->total_capture_count = -1; + s->has_named_captures = -1; + + dbuf_init2(&s->byte_code, opaque, lre_realloc); + dbuf_init2(&s->group_names, opaque, lre_realloc); + + dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */ + dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */ + dbuf_putc(&s->byte_code, 0); /* stack size */ + dbuf_put_u32(&s->byte_code, 0); /* bytecode length */ + + if (!is_sticky) { + /* iterate thru all positions (about the same as .*?( ... ) ) + . 
We do it without an explicit loop so that lock step + thread execution will be possible in an optimized + implementation */ + re_emit_op_u32(s, REOP_split_goto_first, 1 + 5); + re_emit_op(s, REOP_any); + re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5)); + } + re_emit_op_u8(s, REOP_save_start, 0); + + if (re_parse_disjunction(s, FALSE)) { + error: + dbuf_free(&s->byte_code); + dbuf_free(&s->group_names); + pstrcpy(error_msg, error_msg_size, s->u.error_msg); + *plen = 0; + return NULL; + } + + re_emit_op_u8(s, REOP_save_end, 0); - re_emit_op_u8(s, REOP_save_end, 0); + re_emit_op(s, REOP_match); - re_emit_op(s, REOP_match); + if (*s->buf_ptr != '\0') { + re_parse_error(s, "extraneous characters at the end"); + goto error; + } - if (*s->buf_ptr != '\0') { - re_parse_error(s, "extraneous characters at the end"); - goto error; - } + if (dbuf_error(&s->byte_code)) { + re_parse_out_of_memory(s); + goto error; + } - if (dbuf_error(&s->byte_code)) { - re_parse_out_of_memory(s); - goto error; - } - - stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); - if (stack_size < 0) { - re_parse_error(s, "too many imbricated quantifiers"); - goto error; - } - - s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; - s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; - put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN); - - /* add the named groups if needed */ - if (s->group_names.size > (s->capture_count - 1)) { - dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size); - s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS; - } - dbuf_free(&s->group_names); + stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); + if (stack_size < 0) { + re_parse_error(s, "too many imbricated quantifiers"); + goto error; + } + + s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; + s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; + put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN, + s->byte_code.size - RE_HEADER_LEN); + 
+ /* add the named groups if needed */ + if (s->group_names.size > (s->capture_count - 1)) { + dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size); + s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS; + } + dbuf_free(&s->group_names); #ifdef DUMP_REOP - lre_dump_bytecode(s->byte_code.buf, s->byte_code.size); + lre_dump_bytecode(s->byte_code.buf, s->byte_code.size); #endif - error_msg[0] = '\0'; - *plen = s->byte_code.size; - return s->byte_code.buf; + error_msg[0] = '\0'; + *plen = s->byte_code.size; + return s->byte_code.buf; } -static BOOL is_line_terminator(uint32_t c) { - return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); +static BOOL is_line_terminator(uint32_t c) +{ + return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); } -static BOOL is_word_char(uint32_t c) { - return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || - (c >= 'A' && c <= 'Z') || (c == '_')); +static BOOL is_word_char(uint32_t c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c == '_')); } -#define GET_CHAR(c, cptr, cbuf_end) \ - do { \ - if (cbuf_type == 0) { \ - c = *cptr++; \ - } else { \ - uint32_t __c1; \ - c = *(uint16_t *)cptr; \ - cptr += 2; \ - if (c >= 0xd800 && c < 0xdc00 && cbuf_type == 2 && cptr < cbuf_end) { \ - __c1 = *(uint16_t *)cptr; \ - if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ - c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ - cptr += 2; \ - } \ - } \ - } \ - } while (0) - -#define PEEK_CHAR(c, cptr, cbuf_end) \ - do { \ - if (cbuf_type == 0) { \ - c = cptr[0]; \ - } else { \ - uint32_t __c1; \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xd800 && c < 0xdc00 && cbuf_type == 2 && \ - (cptr + 2) < cbuf_end) { \ - __c1 = ((uint16_t *)cptr)[1]; \ - if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ - c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ - } \ - } \ - } \ - } while (0) - -#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ - do { \ - if (cbuf_type == 0) { \ - c = cptr[-1]; \ - 
} else { \ - uint32_t __c1; \ - c = ((uint16_t *)cptr)[-1]; \ - if (c >= 0xdc00 && c < 0xe000 && cbuf_type == 2 && \ - (cptr - 4) >= cbuf_start) { \ - __c1 = ((uint16_t *)cptr)[-2]; \ - if (__c1 >= 0xd800 && __c1 < 0xdc00) { \ - c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ - } \ - } \ - } \ - } while (0) - -#define GET_PREV_CHAR(c, cptr, cbuf_start) \ - do { \ - if (cbuf_type == 0) { \ - cptr--; \ - c = cptr[0]; \ - } else { \ - uint32_t __c1; \ - cptr -= 2; \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xdc00 && c < 0xe000 && cbuf_type == 2 && cptr > cbuf_start) { \ - __c1 = ((uint16_t *)cptr)[-1]; \ - if (__c1 >= 0xd800 && __c1 < 0xdc00) { \ - cptr -= 2; \ - c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ - } \ - } \ - } \ - } while (0) - -#define PREV_CHAR(cptr, cbuf_start) \ - do { \ - if (cbuf_type == 0) { \ - cptr--; \ - } else { \ - cptr -= 2; \ - if (cbuf_type == 2) { \ - c = ((uint16_t *)cptr)[0]; \ - if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \ - c = ((uint16_t *)cptr)[-1]; \ - if (c >= 0xd800 && c < 0xdc00) \ - cptr -= 2; \ - } \ - } \ - } \ - } while (0) +#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \ + do { \ + if (cbuf_type == 0) { \ + c = *cptr++; \ + } else { \ + const uint16_t *_p = (const uint16_t *)cptr; \ + const uint16_t *_end = (const uint16_t *)cbuf_end; \ + c = *_p++; \ + if (is_hi_surrogate(c) && cbuf_type == 2) { \ + if (_p < _end && is_lo_surrogate(*_p)) { \ + c = from_surrogate(c, *_p++); \ + } \ + } \ + cptr = (const void *)_p; \ + } \ + } while (0) + +#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \ + do { \ + if (cbuf_type == 0) { \ + c = cptr[0]; \ + } else { \ + const uint16_t *_p = (const uint16_t *)cptr; \ + const uint16_t *_end = (const uint16_t *)cbuf_end; \ + c = *_p++; \ + if (is_hi_surrogate(c) && cbuf_type == 2) { \ + if (_p < _end && is_lo_surrogate(*_p)) { \ + c = from_surrogate(c, *_p); \ + } \ + } \ + } \ + } while (0) + +#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ + do { \ + 
if (cbuf_type == 0) { \ + c = cptr[-1]; \ + } else { \ + const uint16_t *_p = (const uint16_t *)cptr - 1; \ + const uint16_t *_start = (const uint16_t *)cbuf_start; \ + c = *_p; \ + if (is_lo_surrogate(c) && cbuf_type == 2) { \ + if (_p > _start && is_hi_surrogate(_p[-1])) { \ + c = from_surrogate(*--_p, c); \ + } \ + } \ + } \ + } while (0) + +#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \ + do { \ + if (cbuf_type == 0) { \ + cptr--; \ + c = cptr[0]; \ + } else { \ + const uint16_t *_p = (const uint16_t *)cptr - 1; \ + const uint16_t *_start = (const uint16_t *)cbuf_start; \ + c = *_p; \ + if (is_lo_surrogate(c) && cbuf_type == 2) { \ + if (_p > _start && is_hi_surrogate(_p[-1])) { \ + c = from_surrogate(*--_p, c); \ + } \ + } \ + cptr = (const void *)_p; \ + } \ + } while (0) + +#define PREV_CHAR(cptr, cbuf_start, cbuf_type) \ + do { \ + if (cbuf_type == 0) { \ + cptr--; \ + } else { \ + const uint16_t *_p = (const uint16_t *)cptr - 1; \ + const uint16_t *_start = (const uint16_t *)cbuf_start; \ + if (is_lo_surrogate(*_p) && cbuf_type == 2) { \ + if (_p > _start && is_hi_surrogate(_p[-1])) { \ + --_p; \ + } \ + } \ + cptr = (const void *)_p; \ + } \ + } while (0) typedef uintptr_t StackInt; typedef enum { - RE_EXEC_STATE_SPLIT, - RE_EXEC_STATE_LOOKAHEAD, - RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, - RE_EXEC_STATE_GREEDY_QUANT, + RE_EXEC_STATE_SPLIT, + RE_EXEC_STATE_LOOKAHEAD, + RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, + RE_EXEC_STATE_GREEDY_QUANT, } REExecStateEnum; typedef struct REExecState { - REExecStateEnum type : 8; - uint8_t stack_len; - size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */ - const uint8_t *cptr; - const uint8_t *pc; - void *buf[0]; + REExecStateEnum type : 8; + uint8_t stack_len; + size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */ + const uint8_t *cptr; + const uint8_t *pc; + void *buf[0]; } REExecState; typedef struct { - const uint8_t *cbuf; - const uint8_t *cbuf_end; - /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, 
UTF-16 */ - int cbuf_type; - int capture_count; - int stack_size_max; - BOOL multi_line; - BOOL ignore_case; - BOOL is_utf16; - void *opaque; /* used for stack overflow check */ - - size_t state_size; - uint8_t *state_stack; - size_t state_stack_size; - size_t state_stack_len; + const uint8_t *cbuf; + const uint8_t *cbuf_end; + /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */ + int cbuf_type; + int capture_count; + int stack_size_max; + BOOL multi_line; + BOOL ignore_case; + BOOL is_unicode; + void *opaque; /* used for stack overflow check */ + + size_t state_size; + uint8_t *state_stack; + size_t state_stack_size; + size_t state_stack_len; } REExecContext; -static int push_state(REExecContext *s, uint8_t **capture, StackInt *stack, - size_t stack_len, const uint8_t *pc, const uint8_t *cptr, - REExecStateEnum type, size_t count) { - REExecState *rs; - uint8_t *new_stack; - size_t new_size, i, n; - StackInt *stack_buf; - - if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) { - /* reallocate the stack */ - new_size = s->state_stack_size * 3 / 2; - if (new_size < 8) - new_size = 8; - new_stack = - lre_realloc(s->opaque, s->state_stack, new_size * s->state_size); - if (!new_stack) - return -1; - s->state_stack_size = new_size; - s->state_stack = new_stack; - } - rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size); - s->state_stack_len++; - rs->type = type; - rs->count = count; - rs->stack_len = stack_len; - rs->cptr = cptr; - rs->pc = pc; - n = 2 * s->capture_count; - for (i = 0; i < n; i++) - rs->buf[i] = capture[i]; - stack_buf = (StackInt *)(rs->buf + n); - for (i = 0; i < stack_len; i++) - stack_buf[i] = stack[i]; - return 0; +static int push_state(REExecContext *s, + uint8_t **capture, + StackInt *stack, size_t stack_len, + const uint8_t *pc, const uint8_t *cptr, + REExecStateEnum type, size_t count) +{ + REExecState *rs; + uint8_t *new_stack; + size_t new_size, i, n; + StackInt *stack_buf; + + if 
(unlikely((s->state_stack_len + 1) > s->state_stack_size)) { + /* reallocate the stack */ + new_size = s->state_stack_size * 3 / 2; + if (new_size < 8) + new_size = 8; + new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size); + if (!new_stack) + return -1; + s->state_stack_size = new_size; + s->state_stack = new_stack; + } + rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size); + s->state_stack_len++; + rs->type = type; + rs->count = count; + rs->stack_len = stack_len; + rs->cptr = cptr; + rs->pc = pc; + n = 2 * s->capture_count; + for(i = 0; i < n; i++) + rs->buf[i] = capture[i]; + stack_buf = (StackInt *)(rs->buf + n); + for(i = 0; i < stack_len; i++) + stack_buf[i] = stack[i]; + return 0; } /* return 1 if match, 0 if not match or -1 if error. */ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, StackInt *stack, int stack_len, const uint8_t *pc, const uint8_t *cptr, - BOOL no_recurse) { - int opcode, ret; - int cbuf_type; - uint32_t val, c; - const uint8_t *cbuf_end; - - cbuf_type = s->cbuf_type; - cbuf_end = s->cbuf_end; - - for (;;) { - // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + - // RE_HEADER_LEN))); - opcode = *pc++; - switch (opcode) { - case REOP_match: { - REExecState *rs; - if (no_recurse) - return (intptr_t)cptr; - ret = 1; - goto recurse; - no_match: - if (no_recurse) - return 0; - ret = 0; - recurse: - for (;;) { - if (s->state_stack_len == 0) - return ret; - rs = (REExecState *)(s->state_stack + - (s->state_stack_len - 1) * s->state_size); - if (rs->type == RE_EXEC_STATE_SPLIT) { - if (!ret) { - pop_state: - memcpy(capture, rs->buf, sizeof(capture[0]) * 2 * s->capture_count); - pop_state1: - pc = rs->pc; - cptr = rs->cptr; - stack_len = rs->stack_len; - memcpy(stack, rs->buf + 2 * s->capture_count, - stack_len * sizeof(stack[0])); - s->state_stack_len--; + BOOL no_recurse) +{ + int opcode, ret; + int cbuf_type; + uint32_t val, c; + const uint8_t *cbuf_end; + + 
cbuf_type = s->cbuf_type; + cbuf_end = s->cbuf_end; + + for(;;) { + // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN))); + opcode = *pc++; + switch(opcode) { + case REOP_match: + { + REExecState *rs; + if (no_recurse) + return (intptr_t)cptr; + ret = 1; + goto recurse; + no_match: + if (no_recurse) + return 0; + ret = 0; + recurse: + for(;;) { + if (s->state_stack_len == 0) + return ret; + rs = (REExecState *)(s->state_stack + + (s->state_stack_len - 1) * s->state_size); + if (rs->type == RE_EXEC_STATE_SPLIT) { + if (!ret) { + pop_state: + memcpy(capture, rs->buf, + sizeof(capture[0]) * 2 * s->capture_count); + pop_state1: + pc = rs->pc; + cptr = rs->cptr; + stack_len = rs->stack_len; + memcpy(stack, rs->buf + 2 * s->capture_count, + stack_len * sizeof(stack[0])); + s->state_stack_len--; + break; + } + } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) { + if (!ret) { + uint32_t char_count, i; + memcpy(capture, rs->buf, + sizeof(capture[0]) * 2 * s->capture_count); + stack_len = rs->stack_len; + memcpy(stack, rs->buf + 2 * s->capture_count, + stack_len * sizeof(stack[0])); + pc = rs->pc; + cptr = rs->cptr; + /* go backward */ + char_count = get_u32(pc + 12); + for(i = 0; i < char_count; i++) { + PREV_CHAR(cptr, s->cbuf, cbuf_type); + } + pc = (pc + 16) + (int)get_u32(pc); + rs->cptr = cptr; + rs->count--; + if (rs->count == 0) { + s->state_stack_len--; + } + break; + } + } else { + ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) || + (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret)); + if (ret) { + /* keep the capture in case of positive lookahead */ + if (rs->type == RE_EXEC_STATE_LOOKAHEAD) + goto pop_state1; + else + goto pop_state; + } + } + s->state_stack_len--; + } + } + break; + case REOP_char32: + val = get_u32(pc); + pc += 4; + goto test_char; + case REOP_char: + val = get_u16(pc); + pc += 2; + test_char: + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end, cbuf_type); + if (s->ignore_case) { + c = 
lre_canonicalize(c, s->is_unicode); + } + if (val != c) + goto no_match; break; - } - } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) { - if (!ret) { - uint32_t char_count, i; - memcpy(capture, rs->buf, sizeof(capture[0]) * 2 * s->capture_count); - stack_len = rs->stack_len; - memcpy(stack, rs->buf + 2 * s->capture_count, - stack_len * sizeof(stack[0])); - pc = rs->pc; - cptr = rs->cptr; - /* go backward */ - char_count = get_u32(pc + 12); - for (i = 0; i < char_count; i++) { - PREV_CHAR(cptr, s->cbuf); + case REOP_split_goto_first: + case REOP_split_next_first: + { + const uint8_t *pc1; + + val = get_u32(pc); + pc += 4; + if (opcode == REOP_split_next_first) { + pc1 = pc + (int)val; + } else { + pc1 = pc; + pc = pc + (int)val; + } + ret = push_state(s, capture, stack, stack_len, + pc1, cptr, RE_EXEC_STATE_SPLIT, 0); + if (ret < 0) + return -1; + break; } - pc = (pc + 16) + (int)get_u32(pc); - rs->cptr = cptr; - rs->count--; - if (rs->count == 0) { - s->state_stack_len--; + case REOP_lookahead: + case REOP_negative_lookahead: + val = get_u32(pc); + pc += 4; + ret = push_state(s, capture, stack, stack_len, + pc + (int)val, cptr, + RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, + 0); + if (ret < 0) + return -1; + break; + + case REOP_goto: + val = get_u32(pc); + pc += 4 + (int)val; + break; + case REOP_line_start: + if (cptr == s->cbuf) + break; + if (!s->multi_line) + goto no_match; + PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); + if (!is_line_terminator(c)) + goto no_match; + break; + case REOP_line_end: + if (cptr == cbuf_end) + break; + if (!s->multi_line) + goto no_match; + PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); + if (!is_line_terminator(c)) + goto no_match; + break; + case REOP_dot: + if (cptr == cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end, cbuf_type); + if (is_line_terminator(c)) + goto no_match; + break; + case REOP_any: + if (cptr == cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end, cbuf_type); + break; + case REOP_save_start: + 
case REOP_save_end: + val = *pc++; + assert(val < s->capture_count); + capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr; + break; + case REOP_save_reset: + { + uint32_t val2; + val = pc[0]; + val2 = pc[1]; + pc += 2; + assert(val2 < s->capture_count); + while (val <= val2) { + capture[2 * val] = NULL; + capture[2 * val + 1] = NULL; + val++; + } } break; - } - } else { - ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) || - (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret)); - if (ret) { - /* keep the capture in case of positive lookahead */ - if (rs->type == RE_EXEC_STATE_LOOKAHEAD) - goto pop_state1; - else - goto pop_state; - } - } - s->state_stack_len--; - } - } break; - case REOP_char32: - val = get_u32(pc); - pc += 4; - goto test_char; - case REOP_char: - val = get_u16(pc); - pc += 2; - test_char: - if (cptr >= cbuf_end) - goto no_match; - GET_CHAR(c, cptr, cbuf_end); - if (s->ignore_case) { - c = lre_canonicalize(c, s->is_utf16); - } - if (val != c) - goto no_match; - break; - case REOP_split_goto_first: - case REOP_split_next_first: { - const uint8_t *pc1; - - val = get_u32(pc); - pc += 4; - if (opcode == REOP_split_next_first) { - pc1 = pc + (int)val; - } else { - pc1 = pc; - pc = pc + (int)val; - } - ret = push_state(s, capture, stack, stack_len, pc1, cptr, - RE_EXEC_STATE_SPLIT, 0); - if (ret < 0) - return -1; - break; - } - case REOP_lookahead: - case REOP_negative_lookahead: - val = get_u32(pc); - pc += 4; - ret = push_state(s, capture, stack, stack_len, pc + (int)val, cptr, - RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, 0); - if (ret < 0) - return -1; - break; - - case REOP_goto: - val = get_u32(pc); - pc += 4 + (int)val; - break; - case REOP_line_start: - if (cptr == s->cbuf) - break; - if (!s->multi_line) - goto no_match; - PEEK_PREV_CHAR(c, cptr, s->cbuf); - if (!is_line_terminator(c)) - goto no_match; - break; - case REOP_line_end: - if (cptr == cbuf_end) - break; - if (!s->multi_line) - goto no_match; - PEEK_CHAR(c, 
cptr, cbuf_end); - if (!is_line_terminator(c)) - goto no_match; - break; - case REOP_dot: - if (cptr == cbuf_end) - goto no_match; - GET_CHAR(c, cptr, cbuf_end); - if (is_line_terminator(c)) - goto no_match; - break; - case REOP_any: - if (cptr == cbuf_end) - goto no_match; - GET_CHAR(c, cptr, cbuf_end); - break; - case REOP_save_start: - case REOP_save_end: - val = *pc++; - assert(val < s->capture_count); - capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr; - break; - case REOP_save_reset: { - uint32_t val2; - val = pc[0]; - val2 = pc[1]; - pc += 2; - assert(val2 < s->capture_count); - while (val <= val2) { - capture[2 * val] = NULL; - capture[2 * val + 1] = NULL; - val++; - } - } break; - case REOP_push_i32: - val = get_u32(pc); - pc += 4; - stack[stack_len++] = val; - break; - case REOP_drop: - stack_len--; - break; - case REOP_loop: - val = get_u32(pc); - pc += 4; - if (--stack[stack_len - 1] != 0) { - pc += (int)val; - } - break; - case REOP_push_char_pos: - stack[stack_len++] = (uintptr_t)cptr; - break; - case REOP_bne_char_pos: - val = get_u32(pc); - pc += 4; - if (stack[--stack_len] != (uintptr_t)cptr) - pc += (int)val; - break; - case REOP_word_boundary: - case REOP_not_word_boundary: { - BOOL v1, v2; - /* char before */ - if (cptr == s->cbuf) { - v1 = FALSE; - } else { - PEEK_PREV_CHAR(c, cptr, s->cbuf); - v1 = is_word_char(c); - } - /* current char */ - if (cptr >= cbuf_end) { - v2 = FALSE; - } else { - PEEK_CHAR(c, cptr, cbuf_end); - v2 = is_word_char(c); - } - if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) - goto no_match; - } break; - case REOP_back_reference: - case REOP_backward_back_reference: { - const uint8_t *cptr1, *cptr1_end, *cptr1_start; - uint32_t c1, c2; - - val = *pc++; - if (val >= s->capture_count) - goto no_match; - cptr1_start = capture[2 * val]; - cptr1_end = capture[2 * val + 1]; - if (!cptr1_start || !cptr1_end) - break; - if (opcode == REOP_back_reference) { - cptr1 = cptr1_start; - while (cptr1 < cptr1_end) { - if 
(cptr >= cbuf_end) - goto no_match; - GET_CHAR(c1, cptr1, cptr1_end); - GET_CHAR(c2, cptr, cbuf_end); - if (s->ignore_case) { - c1 = lre_canonicalize(c1, s->is_utf16); - c2 = lre_canonicalize(c2, s->is_utf16); - } - if (c1 != c2) - goto no_match; - } - } else { - cptr1 = cptr1_end; - while (cptr1 > cptr1_start) { - if (cptr == s->cbuf) - goto no_match; - GET_PREV_CHAR(c1, cptr1, cptr1_start); - GET_PREV_CHAR(c2, cptr, s->cbuf); - if (s->ignore_case) { - c1 = lre_canonicalize(c1, s->is_utf16); - c2 = lre_canonicalize(c2, s->is_utf16); - } - if (c1 != c2) - goto no_match; + case REOP_push_i32: + val = get_u32(pc); + pc += 4; + stack[stack_len++] = val; + break; + case REOP_drop: + stack_len--; + break; + case REOP_loop: + val = get_u32(pc); + pc += 4; + if (--stack[stack_len - 1] != 0) { + pc += (int)val; + } + break; + case REOP_push_char_pos: + stack[stack_len++] = (uintptr_t)cptr; + break; + case REOP_check_advance: + if (stack[--stack_len] == (uintptr_t)cptr) + goto no_match; + break; + case REOP_word_boundary: + case REOP_not_word_boundary: + { + BOOL v1, v2; + /* char before */ + if (cptr == s->cbuf) { + v1 = FALSE; + } else { + PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); + v1 = is_word_char(c); + } + /* current char */ + if (cptr >= cbuf_end) { + v2 = FALSE; + } else { + PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); + v2 = is_word_char(c); + } + if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) + goto no_match; + } + break; + case REOP_back_reference: + case REOP_backward_back_reference: + { + const uint8_t *cptr1, *cptr1_end, *cptr1_start; + uint32_t c1, c2; + + val = *pc++; + if (val >= s->capture_count) + goto no_match; + cptr1_start = capture[2 * val]; + cptr1_end = capture[2 * val + 1]; + if (!cptr1_start || !cptr1_end) + break; + if (opcode == REOP_back_reference) { + cptr1 = cptr1_start; + while (cptr1 < cptr1_end) { + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c1, cptr1, cptr1_end, cbuf_type); + GET_CHAR(c2, cptr, cbuf_end, cbuf_type); + if 
(s->ignore_case) { + c1 = lre_canonicalize(c1, s->is_unicode); + c2 = lre_canonicalize(c2, s->is_unicode); + } + if (c1 != c2) + goto no_match; + } + } else { + cptr1 = cptr1_end; + while (cptr1 > cptr1_start) { + if (cptr == s->cbuf) + goto no_match; + GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type); + GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type); + if (s->ignore_case) { + c1 = lre_canonicalize(c1, s->is_unicode); + c2 = lre_canonicalize(c2, s->is_unicode); + } + if (c1 != c2) + goto no_match; + } + } + } + break; + case REOP_range: + { + int n; + uint32_t low, high, idx_min, idx_max, idx; + + n = get_u16(pc); /* n must be >= 1 */ + pc += 2; + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end, cbuf_type); + if (s->ignore_case) { + c = lre_canonicalize(c, s->is_unicode); + } + idx_min = 0; + low = get_u16(pc + 0 * 4); + if (c < low) + goto no_match; + idx_max = n - 1; + high = get_u16(pc + idx_max * 4 + 2); + /* 0xffff in for last value means +infinity */ + if (unlikely(c >= 0xffff) && high == 0xffff) + goto range_match; + if (c > high) + goto no_match; + while (idx_min <= idx_max) { + idx = (idx_min + idx_max) / 2; + low = get_u16(pc + idx * 4); + high = get_u16(pc + idx * 4 + 2); + if (c < low) + idx_max = idx - 1; + else if (c > high) + idx_min = idx + 1; + else + goto range_match; + } + goto no_match; + range_match: + pc += 4 * n; + } + break; + case REOP_range32: + { + int n; + uint32_t low, high, idx_min, idx_max, idx; + + n = get_u16(pc); /* n must be >= 1 */ + pc += 2; + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end, cbuf_type); + if (s->ignore_case) { + c = lre_canonicalize(c, s->is_unicode); + } + idx_min = 0; + low = get_u32(pc + 0 * 8); + if (c < low) + goto no_match; + idx_max = n - 1; + high = get_u32(pc + idx_max * 8 + 4); + if (c > high) + goto no_match; + while (idx_min <= idx_max) { + idx = (idx_min + idx_max) / 2; + low = get_u32(pc + idx * 8); + high = get_u32(pc + idx * 8 + 4); + if (c < low) + idx_max = 
idx - 1; + else if (c > high) + idx_min = idx + 1; + else + goto range32_match; + } + goto no_match; + range32_match: + pc += 8 * n; + } + break; + case REOP_prev: + /* go to the previous char */ + if (cptr == s->cbuf) + goto no_match; + PREV_CHAR(cptr, s->cbuf, cbuf_type); + break; + case REOP_simple_greedy_quant: + { + uint32_t next_pos, quant_min, quant_max; + size_t q; + intptr_t res; + const uint8_t *pc1; + + next_pos = get_u32(pc); + quant_min = get_u32(pc + 4); + quant_max = get_u32(pc + 8); + pc += 16; + pc1 = pc; + pc += (int)next_pos; + + q = 0; + for(;;) { + res = lre_exec_backtrack(s, capture, stack, stack_len, + pc1, cptr, TRUE); + if (res == -1) + return res; + if (!res) + break; + cptr = (uint8_t *)res; + q++; + if (q >= quant_max && quant_max != INT32_MAX) + break; + } + if (q < quant_min) + goto no_match; + if (q > quant_min) { + /* will examine all matches down to quant_min */ + ret = push_state(s, capture, stack, stack_len, + pc1 - 16, cptr, + RE_EXEC_STATE_GREEDY_QUANT, + q - quant_min); + if (ret < 0) + return -1; + } + } + break; + default: + abort(); } - } - } break; - case REOP_range: { - int n; - uint32_t low, high, idx_min, idx_max, idx; - - n = get_u16(pc); /* n must be >= 1 */ - pc += 2; - if (cptr >= cbuf_end) - goto no_match; - GET_CHAR(c, cptr, cbuf_end); - if (s->ignore_case) { - c = lre_canonicalize(c, s->is_utf16); - } - idx_min = 0; - low = get_u16(pc + 0 * 4); - if (c < low) - goto no_match; - idx_max = n - 1; - high = get_u16(pc + idx_max * 4 + 2); - /* 0xffff in for last value means +infinity */ - if (unlikely(c >= 0xffff) && high == 0xffff) - goto range_match; - if (c > high) - goto no_match; - while (idx_min <= idx_max) { - idx = (idx_min + idx_max) / 2; - low = get_u16(pc + idx * 4); - high = get_u16(pc + idx * 4 + 2); - if (c < low) - idx_max = idx - 1; - else if (c > high) - idx_min = idx + 1; - else - goto range_match; - } - goto no_match; - range_match: - pc += 4 * n; - } break; - case REOP_range32: { - int n; - uint32_t 
low, high, idx_min, idx_max, idx; - - n = get_u16(pc); /* n must be >= 1 */ - pc += 2; - if (cptr >= cbuf_end) - goto no_match; - GET_CHAR(c, cptr, cbuf_end); - if (s->ignore_case) { - c = lre_canonicalize(c, s->is_utf16); - } - idx_min = 0; - low = get_u32(pc + 0 * 8); - if (c < low) - goto no_match; - idx_max = n - 1; - high = get_u32(pc + idx_max * 8 + 4); - if (c > high) - goto no_match; - while (idx_min <= idx_max) { - idx = (idx_min + idx_max) / 2; - low = get_u32(pc + idx * 8); - high = get_u32(pc + idx * 8 + 4); - if (c < low) - idx_max = idx - 1; - else if (c > high) - idx_min = idx + 1; - else - goto range32_match; - } - goto no_match; - range32_match: - pc += 8 * n; - } break; - case REOP_prev: - /* go to the previous char */ - if (cptr == s->cbuf) - goto no_match; - PREV_CHAR(cptr, s->cbuf); - break; - case REOP_simple_greedy_quant: { - uint32_t next_pos, quant_min, quant_max; - size_t q; - intptr_t res; - const uint8_t *pc1; - - next_pos = get_u32(pc); - quant_min = get_u32(pc + 4); - quant_max = get_u32(pc + 8); - pc += 16; - pc1 = pc; - pc += (int)next_pos; - - q = 0; - for (;;) { - res = lre_exec_backtrack(s, capture, stack, stack_len, pc1, cptr, TRUE); - if (res == -1) - return res; - if (!res) - break; - cptr = (uint8_t *)res; - q++; - if (q >= quant_max && quant_max != INT32_MAX) - break; - } - if (q < quant_min) - goto no_match; - if (q > quant_min) { - /* will examine all matches down to quant_min */ - ret = push_state(s, capture, stack, stack_len, pc1 - 16, cptr, - RE_EXEC_STATE_GREEDY_QUANT, q - quant_min); - if (ret < 0) - return -1; - } - } break; - default: - abort(); } - } } /* Return 1 if match, 0 if not match or -1 if error. cindex is the starting position of the match and must be such as 0 <= cindex <= clen. 
*/ -int lre_exec(uint8_t **capture, const uint8_t *bc_buf, const uint8_t *cbuf, - int cindex, int clen, int cbuf_type, void *opaque) { - REExecContext s_s, *s = &s_s; - int re_flags, i, alloca_size, ret; - StackInt *stack_buf; - - re_flags = bc_buf[RE_HEADER_FLAGS]; - s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; - s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; - s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0; - s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; - s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; - s->cbuf = cbuf; - s->cbuf_end = cbuf + (clen << cbuf_type); - s->cbuf_type = cbuf_type; - if (s->cbuf_type == 1 && s->is_utf16) - s->cbuf_type = 2; - s->opaque = opaque; - - s->state_size = sizeof(REExecState) + - s->capture_count * sizeof(capture[0]) * 2 + - s->stack_size_max * sizeof(stack_buf[0]); - s->state_stack = NULL; - s->state_stack_len = 0; - s->state_stack_size = 0; - - for (i = 0; i < s->capture_count * 2; i++) - capture[i] = NULL; - alloca_size = s->stack_size_max * sizeof(stack_buf[0]); - stack_buf = alloca(alloca_size); - ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN, - cbuf + (cindex << cbuf_type), FALSE); - lre_realloc(s->opaque, s->state_stack, 0); - return ret; +int lre_exec(uint8_t **capture, + const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen, + int cbuf_type, void *opaque) +{ + REExecContext s_s, *s = &s_s; + int re_flags, i, alloca_size, ret; + StackInt *stack_buf; + + re_flags = lre_get_flags(bc_buf); + s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; + s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; + s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0; + s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; + s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; + s->cbuf = cbuf; + s->cbuf_end = cbuf + (clen << cbuf_type); + s->cbuf_type = cbuf_type; + if (s->cbuf_type == 1 && s->is_unicode) + s->cbuf_type = 2; + s->opaque = opaque; + + s->state_size = 
sizeof(REExecState) + + s->capture_count * sizeof(capture[0]) * 2 + + s->stack_size_max * sizeof(stack_buf[0]); + s->state_stack = NULL; + s->state_stack_len = 0; + s->state_stack_size = 0; + + for(i = 0; i < s->capture_count * 2; i++) + capture[i] = NULL; + alloca_size = s->stack_size_max * sizeof(stack_buf[0]); + stack_buf = alloca(alloca_size); + ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN, + cbuf + (cindex << cbuf_type), FALSE); + lre_realloc(s->opaque, s->state_stack, 0); + return ret; } -int lre_get_capture_count(const uint8_t *bc_buf) { - return bc_buf[RE_HEADER_CAPTURE_COUNT]; +int lre_get_capture_count(const uint8_t *bc_buf) +{ + return bc_buf[RE_HEADER_CAPTURE_COUNT]; } -int lre_get_flags(const uint8_t *bc_buf) { return bc_buf[RE_HEADER_FLAGS]; } +int lre_get_flags(const uint8_t *bc_buf) +{ + return bc_buf[RE_HEADER_FLAGS]; +} /* Return NULL if no group names. Otherwise, return a pointer to 'capture_count - 1' zero terminated UTF-8 strings. */ -const char *lre_get_groupnames(const uint8_t *bc_buf) { - uint32_t re_bytecode_len; - if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0) - return NULL; - re_bytecode_len = get_u32(bc_buf + 3); - return (const char *)(bc_buf + 7 + re_bytecode_len); +const char *lre_get_groupnames(const uint8_t *bc_buf) +{ + uint32_t re_bytecode_len; + if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0) + return NULL; + re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN); + return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len); } -BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) { - return FALSE; +#ifdef TEST + +BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) +{ + return FALSE; } -void *lre_realloc(void *opaque, void *ptr, size_t size) { - return realloc(ptr, size); +void *lre_realloc(void *opaque, void *ptr, size_t size) +{ + return realloc(ptr, size); } -#ifdef TEST +int main(int argc, char **argv) +{ + int len, flags, ret, i; + uint8_t *bc; 
+ char error_msg[64]; + uint8_t *capture[CAPTURE_COUNT_MAX * 2]; + const char *input; + int input_len, capture_count; + + if (argc < 4) { + printf("usage: %s regexp flags input\n", argv[0]); + return 1; + } + flags = atoi(argv[2]); + bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], + strlen(argv[1]), flags, NULL); + if (!bc) { + fprintf(stderr, "error: %s\n", error_msg); + exit(1); + } -int main(int argc, char **argv) { - int len, ret, i; - uint8_t *bc; - char error_msg[64]; - uint8_t *capture[CAPTURE_COUNT_MAX * 2]; - const char *input; - int input_len, capture_count; - - if (argc < 3) { - printf("usage: %s regexp input\n", argv[0]); - exit(1); - } - bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], strlen(argv[1]), - 0, NULL); - if (!bc) { - fprintf(stderr, "error: %s\n", error_msg); - exit(1); - } - - input = argv[2]; - input_len = strlen(input); - - ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); - printf("ret=%d\n", ret); - if (ret == 1) { - capture_count = lre_get_capture_count(bc); - for (i = 0; i < 2 * capture_count; i++) { - uint8_t *ptr; - ptr = capture[i]; - printf("%d: ", i); - if (!ptr) - printf(""); - else - printf("%u", (int)(ptr - (uint8_t *)input)); - printf("\n"); + input = argv[3]; + input_len = strlen(input); + + ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); + printf("ret=%d\n", ret); + if (ret == 1) { + capture_count = lre_get_capture_count(bc); + for(i = 0; i < 2 * capture_count; i++) { + uint8_t *ptr; + ptr = capture[i]; + printf("%d: ", i); + if (!ptr) + printf(""); + else + printf("%u", (int)(ptr - (uint8_t *)input)); + printf("\n"); + } } - } - return 0; + return 0; } #endif diff --git a/libregexp/libregexp.h b/libregexp/libregexp.h index 9aedb7e..7af7ece 100644 --- a/libregexp/libregexp.h +++ b/libregexp/libregexp.h @@ -1,6 +1,6 @@ /* * Regular Expression Engine - * + * * Copyright (c) 2017-2018 Fabrice Bellard * * Permission is hereby granted, free of charge, to any 
person obtaining a copy @@ -25,18 +25,15 @@ #define LIBREGEXP_H #include - -#include "libunicode.h" - -#define LRE_BOOL int /* for documentation purposes */ +#include #define LRE_FLAG_GLOBAL (1 << 0) #define LRE_FLAG_IGNORECASE (1 << 1) #define LRE_FLAG_MULTILINE (1 << 2) #define LRE_FLAG_DOTALL (1 << 3) -#define LRE_FLAG_UTF16 (1 << 4) +#define LRE_FLAG_UNICODE (1 << 4) #define LRE_FLAG_STICKY (1 << 5) - +#define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */ #define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, @@ -50,43 +47,9 @@ int lre_exec(uint8_t **capture, int cbuf_type, void *opaque); int lre_parse_escape(const uint8_t **pp, int allow_utf16); -LRE_BOOL lre_is_space(int c); -/* must be provided by the user */ -LRE_BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size); +/* must be provided by the user, return non zero if overflow */ +int lre_check_stack_overflow(void *opaque, size_t alloca_size); void *lre_realloc(void *opaque, void *ptr, size_t size); -/* JS identifier test */ -extern uint32_t const lre_id_start_table_ascii[4]; -extern uint32_t const lre_id_continue_table_ascii[4]; - -static inline int lre_js_is_ident_first(int c) -{ - if ((uint32_t)c < 128) { - return (lre_id_start_table_ascii[c >> 5] >> (c & 31)) & 1; - } else { -#ifdef CONFIG_ALL_UNICODE - return lre_is_id_start(c); -#else - return !lre_is_space(c); -#endif - } -} - -static inline int lre_js_is_ident_next(int c) -{ - if ((uint32_t)c < 128) { - return (lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1; - } else { - /* ZWNJ and ZWJ are accepted in identifiers */ -#ifdef CONFIG_ALL_UNICODE - return lre_is_id_continue(c) || c == 0x200C || c == 0x200D; -#else - return !lre_is_space(c) || c == 0x200C || c == 0x200D; -#endif - } -} - -#undef LRE_BOOL - #endif /* LIBREGEXP_H */ diff --git a/libregexp/libunicode-table.h b/libregexp/libunicode-table.h index 
b64178b..72d495e 100644 --- a/libregexp/libunicode-table.h +++ b/libregexp/libunicode-table.h @@ -189,9 +189,13 @@ static const uint8_t unicode_prop_Cased1_table[196] = { }; static const uint8_t unicode_prop_Cased1_index[21] = { - 0xb9, 0x02, 0xe0, 0xc0, 0x1d, 0x20, 0xe5, 0x2c, - 0x20, 0xb1, 0x07, 0x21, 0xc1, 0xd6, 0x21, 0x4a, - 0xf1, 0x01, 0x8a, 0xf1, 0x01, + 0xb9, 0x02, 0xe0, // 002B9 at 39 + 0xc0, 0x1d, 0x20, // 01DC0 at 65 + 0xe5, 0x2c, 0x20, // 02CE5 at 97 + 0xb1, 0x07, 0x21, // 107B1 at 129 + 0xc1, 0xd6, 0x21, // 1D6C1 at 161 + 0x4a, 0xf1, 0x01, // 1F14A at 192 + 0x8a, 0xf1, 0x01, // 1F18A at 224 (upper bound) }; static const uint8_t unicode_prop_Case_Ignorable_table[737] = { @@ -291,15 +295,29 @@ static const uint8_t unicode_prop_Case_Ignorable_table[737] = { }; static const uint8_t unicode_prop_Case_Ignorable_index[69] = { - 0xbe, 0x05, 0x00, 0xfe, 0x07, 0x00, 0x52, 0x0a, - 0xa0, 0xc1, 0x0b, 0x00, 0x82, 0x0d, 0x00, 0x3f, - 0x10, 0x80, 0xd4, 0x17, 0x40, 0xcf, 0x1a, 0x20, - 0xf5, 0x1c, 0x00, 0x80, 0x20, 0x00, 0x16, 0xa0, - 0x00, 0xc6, 0xa8, 0x00, 0xc2, 0xaa, 0x60, 0x56, - 0xfe, 0x20, 0xb1, 0x07, 0x01, 0x75, 0x10, 0x01, - 0xeb, 0x12, 0x21, 0x41, 0x16, 0x01, 0x5c, 0x1a, - 0x01, 0x43, 0x1f, 0x01, 0x2e, 0xcf, 0x41, 0x25, - 0xe0, 0x01, 0xf0, 0x01, 0x0e, + 0xbe, 0x05, 0x00, // 005BE at 32 + 0xfe, 0x07, 0x00, // 007FE at 64 + 0x52, 0x0a, 0xa0, // 00A52 at 101 + 0xc1, 0x0b, 0x00, // 00BC1 at 128 + 0x82, 0x0d, 0x00, // 00D82 at 160 + 0x3f, 0x10, 0x80, // 0103F at 196 + 0xd4, 0x17, 0x40, // 017D4 at 226 + 0xcf, 0x1a, 0x20, // 01ACF at 257 + 0xf5, 0x1c, 0x00, // 01CF5 at 288 + 0x80, 0x20, 0x00, // 02080 at 320 + 0x16, 0xa0, 0x00, // 0A016 at 352 + 0xc6, 0xa8, 0x00, // 0A8C6 at 384 + 0xc2, 0xaa, 0x60, // 0AAC2 at 419 + 0x56, 0xfe, 0x20, // 0FE56 at 449 + 0xb1, 0x07, 0x01, // 107B1 at 480 + 0x75, 0x10, 0x01, // 11075 at 512 + 0xeb, 0x12, 0x21, // 112EB at 545 + 0x41, 0x16, 0x01, // 11641 at 576 + 0x5c, 0x1a, 0x01, // 11A5C at 608 + 0x43, 0x1f, 0x01, // 11F43 at 640 + 
0x2e, 0xcf, 0x41, // 1CF2E at 674 + 0x25, 0xe0, 0x01, // 1E025 at 704 + 0xf0, 0x01, 0x0e, // E01F0 at 736 (upper bound) }; static const uint8_t unicode_prop_ID_Start_table[1100] = { @@ -444,20 +462,41 @@ static const uint8_t unicode_prop_ID_Start_table[1100] = { }; static const uint8_t unicode_prop_ID_Start_index[105] = { - 0xf6, 0x03, 0x20, 0xa6, 0x07, 0x00, 0xa9, 0x09, - 0x20, 0xb1, 0x0a, 0x00, 0xba, 0x0b, 0x20, 0x3b, - 0x0d, 0x20, 0xc7, 0x0e, 0x20, 0x49, 0x12, 0x00, - 0x9b, 0x16, 0x00, 0xac, 0x19, 0x00, 0xc0, 0x1d, - 0x80, 0x80, 0x20, 0x20, 0x70, 0x2d, 0x00, 0x00, - 0x32, 0x00, 0xda, 0xa7, 0x00, 0x4c, 0xaa, 0x20, - 0xc7, 0xd7, 0x20, 0xfc, 0xfd, 0x20, 0x9d, 0x02, - 0x21, 0x96, 0x05, 0x01, 0xf3, 0x08, 0x01, 0xb3, - 0x0c, 0x21, 0x73, 0x11, 0x61, 0x34, 0x13, 0x01, - 0x1b, 0x17, 0x21, 0x8a, 0x1a, 0x01, 0x34, 0x1f, - 0x21, 0xbf, 0x6a, 0x01, 0x23, 0xb1, 0xa1, 0xad, - 0xd4, 0x01, 0x6f, 0xd7, 0x01, 0xff, 0xe7, 0x61, - 0x5e, 0xee, 0x01, 0xe1, 0xeb, 0x22, 0xb0, 0x23, - 0x03, + 0xf6, 0x03, 0x20, // 003F6 at 33 + 0xa6, 0x07, 0x00, // 007A6 at 64 + 0xa9, 0x09, 0x20, // 009A9 at 97 + 0xb1, 0x0a, 0x00, // 00AB1 at 128 + 0xba, 0x0b, 0x20, // 00BBA at 161 + 0x3b, 0x0d, 0x20, // 00D3B at 193 + 0xc7, 0x0e, 0x20, // 00EC7 at 225 + 0x49, 0x12, 0x00, // 01249 at 256 + 0x9b, 0x16, 0x00, // 0169B at 288 + 0xac, 0x19, 0x00, // 019AC at 320 + 0xc0, 0x1d, 0x80, // 01DC0 at 356 + 0x80, 0x20, 0x20, // 02080 at 385 + 0x70, 0x2d, 0x00, // 02D70 at 416 + 0x00, 0x32, 0x00, // 03200 at 448 + 0xda, 0xa7, 0x00, // 0A7DA at 480 + 0x4c, 0xaa, 0x20, // 0AA4C at 513 + 0xc7, 0xd7, 0x20, // 0D7C7 at 545 + 0xfc, 0xfd, 0x20, // 0FDFC at 577 + 0x9d, 0x02, 0x21, // 1029D at 609 + 0x96, 0x05, 0x01, // 10596 at 640 + 0xf3, 0x08, 0x01, // 108F3 at 672 + 0xb3, 0x0c, 0x21, // 10CB3 at 705 + 0x73, 0x11, 0x61, // 11173 at 739 + 0x34, 0x13, 0x01, // 11334 at 768 + 0x1b, 0x17, 0x21, // 1171B at 801 + 0x8a, 0x1a, 0x01, // 11A8A at 832 + 0x34, 0x1f, 0x21, // 11F34 at 865 + 0xbf, 0x6a, 0x01, // 16ABF at 896 + 0x23, 
0xb1, 0xa1, // 1B123 at 933 + 0xad, 0xd4, 0x01, // 1D4AD at 960 + 0x6f, 0xd7, 0x01, // 1D76F at 992 + 0xff, 0xe7, 0x61, // 1E7FF at 1027 + 0x5e, 0xee, 0x01, // 1EE5E at 1056 + 0xe1, 0xeb, 0x22, // 2EBE1 at 1089 + 0xb0, 0x23, 0x03, // 323B0 at 1120 (upper bound) }; static const uint8_t unicode_prop_ID_Continue1_table[660] = { @@ -547,14 +586,27 @@ static const uint8_t unicode_prop_ID_Continue1_table[660] = { }; static const uint8_t unicode_prop_ID_Continue1_index[63] = { - 0xfa, 0x06, 0x00, 0x70, 0x09, 0x00, 0xf0, 0x0a, - 0x40, 0x57, 0x0c, 0x00, 0xf0, 0x0d, 0x60, 0xc7, - 0x0f, 0x20, 0xea, 0x17, 0x40, 0x05, 0x1b, 0x00, - 0x41, 0x20, 0x00, 0x0c, 0xa8, 0x80, 0x37, 0xaa, - 0x20, 0x50, 0xfe, 0x20, 0x3a, 0x0d, 0x21, 0x74, - 0x11, 0x01, 0x5a, 0x14, 0x21, 0x44, 0x19, 0x81, - 0x5a, 0x1d, 0xa1, 0xf5, 0x6a, 0x21, 0x45, 0xd2, - 0x41, 0xaf, 0xe2, 0x21, 0xf0, 0x01, 0x0e, + 0xfa, 0x06, 0x00, // 006FA at 32 + 0x70, 0x09, 0x00, // 00970 at 64 + 0xf0, 0x0a, 0x40, // 00AF0 at 98 + 0x57, 0x0c, 0x00, // 00C57 at 128 + 0xf0, 0x0d, 0x60, // 00DF0 at 163 + 0xc7, 0x0f, 0x20, // 00FC7 at 193 + 0xea, 0x17, 0x40, // 017EA at 226 + 0x05, 0x1b, 0x00, // 01B05 at 256 + 0x41, 0x20, 0x00, // 02041 at 288 + 0x0c, 0xa8, 0x80, // 0A80C at 324 + 0x37, 0xaa, 0x20, // 0AA37 at 353 + 0x50, 0xfe, 0x20, // 0FE50 at 385 + 0x3a, 0x0d, 0x21, // 10D3A at 417 + 0x74, 0x11, 0x01, // 11174 at 448 + 0x5a, 0x14, 0x21, // 1145A at 481 + 0x44, 0x19, 0x81, // 11944 at 516 + 0x5a, 0x1d, 0xa1, // 11D5A at 549 + 0xf5, 0x6a, 0x21, // 16AF5 at 577 + 0x45, 0xd2, 0x41, // 1D245 at 610 + 0xaf, 0xe2, 0x21, // 1E2AF at 641 + 0xf0, 0x01, 0x0e, // E01F0 at 672 (upper bound) }; #ifdef CONFIG_ALL_UNICODE @@ -676,17 +728,35 @@ static const uint8_t unicode_cc_table[899] = { }; static const uint8_t unicode_cc_index[87] = { - 0x4d, 0x03, 0x00, 0x97, 0x05, 0x20, 0xc6, 0x05, - 0x00, 0xe7, 0x06, 0x00, 0x45, 0x07, 0x00, 0x9c, - 0x08, 0x00, 0x4d, 0x09, 0x00, 0x3c, 0x0b, 0x00, - 0x3d, 0x0d, 0x00, 0x36, 0x0f, 0x00, 0x38, 0x10, - 0x20, 0x3a, 
0x19, 0x00, 0xcb, 0x1a, 0x20, 0xd3, - 0x1c, 0x00, 0xcf, 0x1d, 0x00, 0xe2, 0x20, 0x00, - 0x2e, 0x30, 0x20, 0x2b, 0xa9, 0x20, 0xed, 0xab, - 0x00, 0x39, 0x0a, 0x01, 0x51, 0x0f, 0x01, 0x73, - 0x11, 0x01, 0x75, 0x13, 0x01, 0x2b, 0x17, 0x21, - 0x3f, 0x1c, 0x21, 0x9e, 0xbc, 0x21, 0x08, 0xe0, - 0x01, 0x44, 0xe9, 0x01, 0x4b, 0xe9, 0x01, + 0x4d, 0x03, 0x00, // 0034D at 32 + 0x97, 0x05, 0x20, // 00597 at 65 + 0xc6, 0x05, 0x00, // 005C6 at 96 + 0xe7, 0x06, 0x00, // 006E7 at 128 + 0x45, 0x07, 0x00, // 00745 at 160 + 0x9c, 0x08, 0x00, // 0089C at 192 + 0x4d, 0x09, 0x00, // 0094D at 224 + 0x3c, 0x0b, 0x00, // 00B3C at 256 + 0x3d, 0x0d, 0x00, // 00D3D at 288 + 0x36, 0x0f, 0x00, // 00F36 at 320 + 0x38, 0x10, 0x20, // 01038 at 353 + 0x3a, 0x19, 0x00, // 0193A at 384 + 0xcb, 0x1a, 0x20, // 01ACB at 417 + 0xd3, 0x1c, 0x00, // 01CD3 at 448 + 0xcf, 0x1d, 0x00, // 01DCF at 480 + 0xe2, 0x20, 0x00, // 020E2 at 512 + 0x2e, 0x30, 0x20, // 0302E at 545 + 0x2b, 0xa9, 0x20, // 0A92B at 577 + 0xed, 0xab, 0x00, // 0ABED at 608 + 0x39, 0x0a, 0x01, // 10A39 at 640 + 0x51, 0x0f, 0x01, // 10F51 at 672 + 0x73, 0x11, 0x01, // 11173 at 704 + 0x75, 0x13, 0x01, // 11375 at 736 + 0x2b, 0x17, 0x21, // 1172B at 769 + 0x3f, 0x1c, 0x21, // 11C3F at 801 + 0x9e, 0xbc, 0x21, // 1BC9E at 833 + 0x08, 0xe0, 0x01, // 1E008 at 864 + 0x44, 0xe9, 0x01, // 1E944 at 896 + 0x4b, 0xe9, 0x01, // 1E94B at 928 (upper bound) }; static const uint32_t unicode_decomp_table1[699] = { @@ -3779,72 +3849,70 @@ static const uint8_t unicode_prop_Changes_When_Titlecased1_table[22] = { 0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80, }; -static const uint8_t unicode_prop_Changes_When_Casefolded1_table[33] = { - 0x40, 0xde, 0x80, 0xcf, 0x80, 0x97, 0x80, 0x44, - 0x3c, 0x80, 0x59, 0x11, 0x80, 0x40, 0xe4, 0x3f, - 0x3f, 0x87, 0x89, 0x11, 0x05, 0x02, 0x11, 0x80, - 0xa9, 0x11, 0x80, 0x60, 0xdb, 0x07, 0x86, 0x8b, - 0x84, +static const uint8_t unicode_prop_Changes_When_Casefolded1_table[29] = { + 0x41, 0xef, 0x80, 0x41, 0x9e, 0x80, 0x9e, 0x80, + 0x5a, 0xe4, 
0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, + 0x80, 0xde, 0x06, 0x06, 0x80, 0x8a, 0x09, 0x81, + 0x89, 0x10, 0x81, 0x8d, 0x80, }; -static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[451] = { +static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[447] = { 0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12, - 0x10, 0x82, 0x9f, 0x80, 0xcf, 0x01, 0x80, 0x8b, - 0x07, 0x80, 0xfb, 0x01, 0x01, 0x80, 0xa5, 0x80, - 0x40, 0xbb, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, - 0x81, 0x89, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, - 0x80, 0xc9, 0x82, 0x9c, 0x80, 0x41, 0x93, 0x80, - 0x40, 0x93, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, - 0xfb, 0x08, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, - 0x80, 0x40, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, - 0x80, 0xa7, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, - 0x03, 0x03, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, - 0x26, 0x80, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03, - 0x80, 0x8b, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, - 0x46, 0x52, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, - 0x8a, 0x80, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, - 0xa4, 0x40, 0xd9, 0x80, 0x40, 0xd5, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x3f, 0x3f, 0x87, - 0x89, 0x11, 0x04, 0x00, 0x29, 0x04, 0x12, 0x80, - 0x88, 0x12, 0x80, 0x88, 0x11, 0x11, 0x04, 0x08, - 0x8f, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, - 0x00, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, - 0x80, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, - 0x01, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, - 0x05, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, - 0x40, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, - 0x34, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, - 0x82, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, - 0x80, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, - 0xd5, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, - 0x80, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, - 0x9e, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, - 0x60, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, - 0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, - 0x60, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, - 0x89, 0x00, 0x00, 
0x09, 0x82, 0xc3, 0x81, 0xe9, - 0xa5, 0x86, 0x8b, 0x24, 0x00, 0x97, 0x04, 0x00, - 0x01, 0x01, 0x80, 0xeb, 0xa0, 0x41, 0x6a, 0x91, - 0xbf, 0x81, 0xb5, 0xa7, 0x8c, 0x82, 0x99, 0x95, - 0x94, 0x81, 0x8b, 0x80, 0x92, 0x03, 0x1a, 0x00, - 0x80, 0x40, 0x86, 0x08, 0x80, 0x9f, 0x99, 0x40, - 0x83, 0x15, 0x0d, 0x0d, 0x0a, 0x16, 0x06, 0x80, - 0x88, 0x47, 0x87, 0x20, 0xa9, 0x80, 0x88, 0x60, - 0xb4, 0xe4, 0x83, 0x54, 0xb9, 0x86, 0x8d, 0x87, - 0xbf, 0x85, 0x42, 0x3e, 0xd4, 0x80, 0xc6, 0x01, - 0x08, 0x09, 0x0b, 0x80, 0x8b, 0x00, 0x06, 0x80, - 0xc0, 0x03, 0x0f, 0x06, 0x80, 0x9b, 0x03, 0x04, - 0x00, 0x16, 0x80, 0x41, 0x53, 0x81, 0x41, 0x23, - 0x81, 0xb1, 0x48, 0x2f, 0xbd, 0x4d, 0x91, 0x18, - 0x9a, 0x01, 0x00, 0x08, 0x80, 0x89, 0x03, 0x00, - 0x00, 0x28, 0x18, 0x00, 0x00, 0x02, 0x01, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x0b, - 0x06, 0x03, 0x03, 0x00, 0x80, 0x89, 0x80, 0x90, - 0x22, 0x04, 0x80, 0x90, 0x42, 0x43, 0x8a, 0x84, - 0x9e, 0x80, 0x9f, 0x99, 0x82, 0xa2, 0x80, 0xee, - 0x82, 0x8c, 0xab, 0x83, 0x88, 0x31, 0x49, 0x9d, - 0x89, 0x60, 0xfc, 0x05, 0x42, 0x1d, 0x6b, 0x05, - 0xe1, 0x4f, 0xff, + 0x10, 0x82, 0xf3, 0x80, 0x8b, 0x80, 0x40, 0x84, + 0x01, 0x01, 0x80, 0xa2, 0x01, 0x80, 0x40, 0xbb, + 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x81, 0x89, + 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x07, 0x80, + 0x9e, 0x80, 0xa0, 0x82, 0x9c, 0x80, 0x42, 0x28, + 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0xfb, 0x08, + 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0x40, + 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0x80, 0xa7, + 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x03, 0x03, + 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x26, 0x80, + 0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x80, 0x8b, + 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x46, 0x52, + 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x8a, 0x80, + 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0xa4, 0x40, + 0xd5, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, 0x80, + 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, + 0xb7, 0x05, 0x00, 0x13, 0x05, 0x11, 0x02, 0x0c, + 0x11, 0x00, 0x00, 0x0c, 0x15, 0x05, 0x08, 0x8f, 
+ 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00, + 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x80, + 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x01, + 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x05, + 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x40, + 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0x34, + 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0x82, + 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0x80, + 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0xd5, + 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0x80, + 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0x9e, + 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x60, + 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x80, + 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x60, + 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x89, + 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0xc2, + 0x00, 0x97, 0x04, 0x00, 0x01, 0x01, 0x80, 0xeb, + 0xa0, 0x41, 0x6a, 0x91, 0xbf, 0x81, 0xb5, 0xa7, + 0x8c, 0x82, 0x99, 0x95, 0x94, 0x81, 0x8b, 0x80, + 0x92, 0x03, 0x1a, 0x00, 0x80, 0x40, 0x86, 0x08, + 0x80, 0x9f, 0x99, 0x40, 0x83, 0x15, 0x0d, 0x0d, + 0x0a, 0x16, 0x06, 0x80, 0x88, 0x47, 0x87, 0x20, + 0xa9, 0x80, 0x88, 0x60, 0xb4, 0xe4, 0x83, 0x54, + 0xb9, 0x86, 0x8d, 0x87, 0xbf, 0x85, 0x42, 0x3e, + 0xd4, 0x80, 0xc6, 0x01, 0x08, 0x09, 0x0b, 0x80, + 0x8b, 0x00, 0x06, 0x80, 0xc0, 0x03, 0x0f, 0x06, + 0x80, 0x9b, 0x03, 0x04, 0x00, 0x16, 0x80, 0x41, + 0x53, 0x81, 0x41, 0x23, 0x81, 0xb1, 0x48, 0x2f, + 0xbd, 0x4d, 0x91, 0x18, 0x9a, 0x01, 0x00, 0x08, + 0x80, 0x89, 0x03, 0x00, 0x00, 0x28, 0x18, 0x00, + 0x00, 0x02, 0x01, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x0b, 0x06, 0x03, 0x03, 0x00, + 0x80, 0x89, 0x80, 0x90, 0x22, 0x04, 0x80, 0x90, + 0x42, 0x43, 0x8a, 0x84, 0x9e, 0x80, 0x9f, 0x99, + 0x82, 0xa2, 0x80, 0xee, 0x82, 0x8c, 0xab, 0x83, + 0x88, 0x31, 0x49, 0x9d, 0x89, 0x60, 0xfc, 0x05, + 0x42, 0x1d, 0x6b, 0x05, 0xe1, 0x4f, 0xff, }; static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = { @@ -4486,3 +4554,4 @@ static const uint16_t unicode_prop_len_table[] = { }; #endif /* 
CONFIG_ALL_UNICODE */ +/* 62 tables / 32261 bytes, 5 index / 345 bytes */ diff --git a/libregexp/libunicode.c b/libregexp/libunicode.c index 63c12a0..c80d2f3 100644 --- a/libregexp/libunicode.c +++ b/libregexp/libunicode.c @@ -1,6 +1,6 @@ /* * Unicode utilities - * + * * Copyright (c) 2017-2018 Fabrice Bellard * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -43,15 +43,115 @@ enum { RUN_TYPE_UF_D1_EXT, RUN_TYPE_U_EXT, RUN_TYPE_LF_EXT, - RUN_TYPE_U_EXT2, - RUN_TYPE_L_EXT2, - RUN_TYPE_U_EXT3, + RUN_TYPE_UF_EXT2, + RUN_TYPE_LF_EXT2, + RUN_TYPE_UF_EXT3, }; +static int lre_case_conv1(uint32_t c, int conv_type) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + lre_case_conv(res, c, conv_type); + return res[0]; +} + +/* case conversion using the table entry 'idx' with value 'v' */ +static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v) +{ + uint32_t code, data, type, a, is_lower; + is_lower = (conv_type != 0); + type = (v >> (32 - 17 - 7 - 4)) & 0xf; + data = ((v & 0xf) << 8) | case_conv_table2[idx]; + code = v >> (32 - 17); + switch(type) { + case RUN_TYPE_U: + case RUN_TYPE_L: + case RUN_TYPE_UF: + case RUN_TYPE_LF: + if (conv_type == (type & 1) || + (type >= RUN_TYPE_UF && conv_type == 2)) { + c = c - code + (case_conv_table1[data] >> (32 - 17)); + } + break; + case RUN_TYPE_UL: + a = c - code; + if ((a & 1) != (1 - is_lower)) + break; + c = (a ^ 1) + code; + break; + case RUN_TYPE_LSU: + a = c - code; + if (a == 1) { + c += 2 * is_lower - 1; + } else if (a == (1 - is_lower) * 2) { + c += (2 * is_lower - 1) * 2; + } + break; + case RUN_TYPE_U2L_399_EXT2: + if (!is_lower) { + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = 0x399; + return 2; + } else { + c = c - code + case_conv_ext[data & 0x3f]; + } + break; + case RUN_TYPE_UF_D20: + if (conv_type == 1) + break; + c = data + (conv_type == 2) * 0x20; + break; + case RUN_TYPE_UF_D1_EXT: + if (conv_type == 1) + break; + c = case_conv_ext[data] 
+ (conv_type == 2); + break; + case RUN_TYPE_U_EXT: + case RUN_TYPE_LF_EXT: + if (is_lower != (type - RUN_TYPE_U_EXT)) + break; + c = case_conv_ext[data]; + break; + case RUN_TYPE_LF_EXT2: + if (!is_lower) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + return 2; + case RUN_TYPE_UF_EXT2: + if (conv_type == 1) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + } + return 2; + default: + case RUN_TYPE_UF_EXT3: + if (conv_type == 1) + break; + res[0] = case_conv_ext[data >> 8]; + res[1] = case_conv_ext[(data >> 4) & 0xf]; + res[2] = case_conv_ext[data & 0xf]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + res[2] = lre_case_conv1(res[2], 1); + } + return 3; + } + res[0] = c; + return 1; +} + /* conv_type: - 0 = to upper + 0 = to upper 1 = to lower - 2 = case folding (= to lower with modifications) + 2 = case folding (= to lower with modifications) */ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) { @@ -66,10 +166,9 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) } } } else { - uint32_t v, code, data, type, len, a, is_lower; + uint32_t v, code, len; int idx, idx_min, idx_max; - - is_lower = (conv_type != 0); + idx_min = 0; idx_max = countof(case_conv_table1) - 1; while (idx_min <= idx_max) { @@ -82,74 +181,7 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) } else if (c >= code + len) { idx_min = idx + 1; } else { - type = (v >> (32 - 17 - 7 - 4)) & 0xf; - data = ((v & 0xf) << 8) | case_conv_table2[idx]; - switch(type) { - case RUN_TYPE_U: - case RUN_TYPE_L: - case RUN_TYPE_UF: - case RUN_TYPE_LF: - if (conv_type == (type & 1) || - (type >= RUN_TYPE_UF && conv_type == 2)) { - c = c - code + (case_conv_table1[data] >> (32 - 17)); - } 
- break; - case RUN_TYPE_UL: - a = c - code; - if ((a & 1) != (1 - is_lower)) - break; - c = (a ^ 1) + code; - break; - case RUN_TYPE_LSU: - a = c - code; - if (a == 1) { - c += 2 * is_lower - 1; - } else if (a == (1 - is_lower) * 2) { - c += (2 * is_lower - 1) * 2; - } - break; - case RUN_TYPE_U2L_399_EXT2: - if (!is_lower) { - res[0] = c - code + case_conv_ext[data >> 6]; - res[1] = 0x399; - return 2; - } else { - c = c - code + case_conv_ext[data & 0x3f]; - } - break; - case RUN_TYPE_UF_D20: - if (conv_type == 1) - break; - c = data + (conv_type == 2) * 0x20; - break; - case RUN_TYPE_UF_D1_EXT: - if (conv_type == 1) - break; - c = case_conv_ext[data] + (conv_type == 2); - break; - case RUN_TYPE_U_EXT: - case RUN_TYPE_LF_EXT: - if (is_lower != (type - RUN_TYPE_U_EXT)) - break; - c = case_conv_ext[data]; - break; - case RUN_TYPE_U_EXT2: - case RUN_TYPE_L_EXT2: - if (conv_type != (type - RUN_TYPE_U_EXT2)) - break; - res[0] = c - code + case_conv_ext[data >> 6]; - res[1] = case_conv_ext[data & 0x3f]; - return 2; - default: - case RUN_TYPE_U_EXT3: - if (conv_type != 0) - break; - res[0] = case_conv_ext[data >> 8]; - res[1] = case_conv_ext[(data >> 4) & 0xf]; - res[2] = case_conv_ext[data & 0xf]; - return 3; - } - break; + return lre_case_conv_entry(res, c, conv_type, idx, v); } } } @@ -157,13 +189,80 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) return 1; } +static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + int len; + + if (is_unicode) { + len = lre_case_conv_entry(res, c, 2, idx, v); + if (len == 1) { + c = res[0]; + } else { + /* handle the few specific multi-character cases (see + unicode_gen.c:dump_case_folding_special_cases()) */ + if (c == 0xfb06) { + c = 0xfb05; + } else if (c == 0x01fd3) { + c = 0x390; + } else if (c == 0x01fe3) { + c = 0x3b0; + } + } + } else { + if (likely(c < 128)) { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A'; + } else { + /* legacy 
regexp: to upper case if single char >= 128 */ + len = lre_case_conv_entry(res, c, FALSE, idx, v); + if (len == 1 && res[0] >= 128) + c = res[0]; + } + } + return c; +} + +/* JS regexp specific rules for case folding */ +int lre_canonicalize(uint32_t c, BOOL is_unicode) +{ + if (c < 128) { + /* fast case */ + if (is_unicode) { + if (c >= 'A' && c <= 'Z') { + c = c - 'A' + 'a'; + } + } else { + if (c >= 'a' && c <= 'z') { + c = c - 'a' + 'A'; + } + } + } else { + uint32_t v, code, len; + int idx, idx_min, idx_max; + + idx_min = 0; + idx_max = countof(case_conv_table1) - 1; + while (idx_min <= idx_max) { + idx = (unsigned)(idx_max + idx_min) / 2; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + return lre_case_folding_entry(c, idx, v, is_unicode); + } + } + } + return c; +} + static uint32_t get_le24(const uint8_t *ptr) { -#if defined(__x86__) || defined(__x86_64__) - return *(uint16_t *)ptr | (ptr[2] << 16); -#else return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16); -#endif } #define UNICODE_INDEX_BLOCK_LEN 32 @@ -208,12 +307,20 @@ static BOOL lre_is_in_table(uint32_t c, const uint8_t *table, uint32_t code, b, bit; int pos; const uint8_t *p; - + pos = get_index_pos(&code, c, index_table, index_table_len); if (pos < 0) return FALSE; /* outside the table */ p = table + pos; bit = 0; + /* Compressed run length encoding: + 00..3F: 2 packed lengths: 3-bit + 3-bit + 40..5F: 5-bits plus extra byte for length + 60..7F: 5-bits plus 2 extra bytes for length + 80..FF: 7-bit length + lengths must be incremented to get character count + Ranges alternate between false and true return value. 
+ */ for(;;) { b = *p++; if (b < 64) { @@ -241,7 +348,7 @@ BOOL lre_is_cased(uint32_t c) { uint32_t v, code, len; int idx, idx_min, idx_max; - + idx_min = 0; idx_max = countof(case_conv_table1) - 1; while (idx_min <= idx_max) { @@ -300,7 +407,7 @@ int cr_realloc(CharRange *cr, int size) { int new_size; uint32_t *new_buf; - + if (size > cr->size) { new_size = max_int(size, cr->size * 3 / 2); new_buf = cr->realloc_func(cr->mem_opaque, cr->points, @@ -327,7 +434,7 @@ static void cr_compress(CharRange *cr) { int i, j, k, len; uint32_t *pt; - + pt = cr->points; len = cr->len; i = 0; @@ -357,7 +464,7 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, { int a_idx, b_idx, is_in; uint32_t v; - + a_idx = 0; b_idx = 0; for(;;) { @@ -658,7 +765,7 @@ static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1) { uint32_t v, type, is_compat, code, len; int idx_min, idx_max, idx; - + idx_min = 0; idx_max = countof(unicode_decomp_table1) - 1; while (idx_min <= idx_max) { @@ -688,7 +795,7 @@ static int unicode_compose_pair(uint32_t c0, uint32_t c1) uint32_t code, len, type, v, idx1, d_idx, d_offset, ch; int idx_min, idx_max, idx, d; uint32_t pair[2]; - + idx_min = 0; idx_max = countof(unicode_comp_table) - 1; while (idx_min <= idx_max) { @@ -724,12 +831,19 @@ static int unicode_get_cc(uint32_t c) uint32_t code, n, type, cc, c1, b; int pos; const uint8_t *p; - + pos = get_index_pos(&code, c, unicode_cc_index, sizeof(unicode_cc_index) / 3); if (pos < 0) return 0; p = unicode_cc_table + pos; + /* Compressed run length encoding: + - 2 high order bits are combining class type + - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte + - 00..2F: range length (add 1) + - 30..37: 3-bit range-length + 1 extra byte + - 38..3F: 3-bit range-length + 2 extra byte + */ for(;;) { b = *p++; type = b >> 6; @@ -773,7 +887,7 @@ static int unicode_get_cc(uint32_t c) static void sort_cc(int *buf, int len) { int i, j, k, cc, cc1, start, ch1; - + for(i = 0; i < len; i++) { cc 
= unicode_get_cc(buf[i]); if (cc != 0) { @@ -812,7 +926,7 @@ static void to_nfd_rec(DynBuf *dbuf, uint32_t c, v; int i, l; uint32_t res[UNICODE_DECOMP_LEN_MAX]; - + for(i = 0; i < src_len; i++) { c = src[i]; if (c >= 0xac00 && c < 0xd7a4) { @@ -857,7 +971,7 @@ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len; BOOL is_compat; DynBuf dbuf_s, *dbuf = &dbuf_s; - + is_compat = n_type >> 1; dbuf_init2(dbuf, opaque, realloc_func); @@ -885,15 +999,15 @@ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, } buf = (int *)dbuf->buf; buf_len = dbuf->size / sizeof(int); - + sort_cc(buf, buf_len); - + if (buf_len <= 1 || (n_type & 1) != 0) { /* NFD / NFKD */ *pdst = (uint32_t *)buf; return buf_len; } - + i = 1; out_len = 1; while (i < buf_len) { @@ -930,7 +1044,7 @@ static int unicode_find_name(const char *name_table, const char *name) const char *p, *r; int pos; size_t name_len, len; - + p = name_table; pos = 0; name_len = strlen(name); @@ -963,13 +1077,13 @@ int unicode_script(CharRange *cr, CharRange cr1_s, *cr1; CharRange cr2_s, *cr2 = &cr2_s; BOOL is_common; - + script_idx = unicode_find_name(unicode_script_name_table, script_name); if (script_idx < 0) return -2; /* Note: we remove the "Unknown" Script */ script_idx += UNICODE_SCRIPT_Unknown + 1; - + is_common = (script_idx == UNICODE_SCRIPT_Common || script_idx == UNICODE_SCRIPT_Inherited); if (is_ext) { @@ -1082,6 +1196,15 @@ static int unicode_general_category1(CharRange *cr, uint32_t gc_mask) p = unicode_gc_table; p_end = unicode_gc_table + countof(unicode_gc_table); c = 0; + /* Compressed range encoding: + initial byte: + bits 0..4: category number (special case 31) + bits 5..7: range length (add 1) + special case bits 5..7 == 7: read an extra byte + - 00..7F: range length (add 7 + 1) + - 80..BF: 6-bits plus extra byte for range length (add 7 + 128) + - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 
+ 16384) + */ while (p < p_end) { b = *p++; n = b >> 5; @@ -1135,6 +1258,14 @@ static int unicode_prop1(CharRange *cr, int prop_idx) p_end = p + unicode_prop_len_table[prop_idx]; c = 0; bit = 0; + /* Compressed range encoding: + 00..3F: 2 packed lengths: 3-bit + 3-bit + 40..5F: 5-bits plus extra byte for length + 60..7F: 5-bits plus 2 extra bytes for length + 80..FF: 7-bit length + lengths must be incremented to get character count + Ranges alternate between false and true return value. + */ while (p < p_end) { c0 = c; b = *p++; @@ -1179,11 +1310,11 @@ static int unicode_case1(CharRange *cr, int case_mask) #define MR(x) (1 << RUN_TYPE_ ## x) const uint32_t tab_run_mask[3] = { MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) | - MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3), + MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3), - MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2), + MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2), - MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT), + MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3), }; #undef MR uint32_t mask, v, code, type, len, i, idx; @@ -1236,7 +1367,136 @@ static int unicode_case1(CharRange *cr, int case_mask) } return 0; } - + +static int point_cmp(const void *p1, const void *p2, void *arg) +{ + uint32_t v1 = *(uint32_t *)p1; + uint32_t v2 = *(uint32_t *)p2; + return (v1 > v2) - (v1 < v2); +} + +static void cr_sort_and_remove_overlap(CharRange *cr) +{ + uint32_t start, end, start1, end1, i, j; + + /* the resulting ranges are not necessarily sorted and may overlap */ + rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL); + j = 0; + for(i = 0; i < cr->len; ) { + start = cr->points[i]; + end = cr->points[i + 1]; + i += 2; + while (i < cr->len) { 
+ start1 = cr->points[i]; + end1 = cr->points[i + 1]; + if (start1 > end) { + /* |------| + * |-------| */ + break; + } else if (end1 <= end) { + /* |------| + * |--| */ + i += 2; + } else { + /* |------| + * |-------| */ + end = end1; + i += 2; + } + } + cr->points[j] = start; + cr->points[j + 1] = end; + j += 2; + } + cr->len = j; +} + +/* canonicalize a character set using the JS regex case folding rules + (see lre_canonicalize()) */ +int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode) +{ + CharRange cr_inter, cr_mask, cr_result, cr_sub; + uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d; + + cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_result, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func); + + if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U)) + goto fail; + if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + if (cr_invert(&cr_mask)) + goto fail; + if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + /* cr_inter = cr & cr_mask */ + /* cr_sub = cr & ~cr_mask */ + + /* use the case conversion table to compute the result */ + d_start = -1; + d_end = -1; + idx = 0; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + for(i = 0; i < cr_inter.len; i += 2) { + start = cr_inter.points[i]; + end = cr_inter.points[i + 1]; + + for(c = start; c < end; c++) { + for(;;) { + if (c >= code && c < code + len) + break; + idx++; + assert(idx < countof(case_conv_table1)); + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + } + d = lre_case_folding_entry(c, idx, v, is_unicode); + /* try to merge with the current interval */ + if (d_start == -1) { + d_start = d; + d_end = d + 1; + } else if (d_end == d) { + d_end++; + } else { + cr_add_interval(&cr_result, 
d_start, d_end); + d_start = d; + d_end = d + 1; + } + } + } + if (d_start != -1) { + if (cr_add_interval(&cr_result, d_start, d_end)) + goto fail; + } + + /* the resulting ranges are not necessarily sorted and may overlap */ + cr_sort_and_remove_overlap(&cr_result); + + /* or with the character not affected by the case folding */ + cr->len = 0; + if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION)) + goto fail; + + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return 0; + fail: + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return -1; +} + typedef enum { POP_GC, POP_PROP, @@ -1256,7 +1516,7 @@ static int unicode_prop_ops(CharRange *cr, ...) CharRange stack[POP_STACK_LEN_MAX]; int stack_len, op, ret, i; uint32_t a; - + va_start(ap, cr); stack_len = 0; for(;;) { @@ -1342,7 +1602,7 @@ int unicode_general_category(CharRange *cr, const char *gc_name) { int gc_idx; uint32_t gc_mask; - + gc_idx = unicode_find_name(unicode_gc_name_table, gc_name); if (gc_idx < 0) return -2; @@ -1360,7 +1620,7 @@ int unicode_general_category(CharRange *cr, const char *gc_name) int unicode_prop(CharRange *cr, const char *prop_name) { int prop_idx, ret; - + prop_idx = unicode_find_name(unicode_prop_name_table, prop_name); if (prop_idx < 0) return -2; @@ -1554,3 +1814,97 @@ int unicode_prop(CharRange *cr, const char *prop_name) } #endif /* CONFIG_ALL_UNICODE */ + +/*---- lre codepoint categorizing functions ----*/ + +#define S UNICODE_C_SPACE +#define D UNICODE_C_DIGIT +#define X UNICODE_C_XDIGIT +#define U UNICODE_C_UPPER +#define L UNICODE_C_LOWER +#define _ UNICODE_C_UNDER +#define d UNICODE_C_DOLLAR + +uint8_t const lre_ctype_bits[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, S, S, S, S, S, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + S, 0, 0, 0, d, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D, + X|D, X|D, 0, 0, 0, 0, 0, 0, + + 0, 
X|U, X|U, X|U, X|U, X|U, X|U, U, + U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, + U, U, U, 0, 0, 0, 0, _, + + 0, X|L, X|L, X|L, X|L, X|L, X|L, L, + L, L, L, L, L, L, L, L, + L, L, L, L, L, L, L, L, + L, L, L, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + S, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +#undef S +#undef D +#undef X +#undef U +#undef L +#undef _ +#undef d + +/* code point ranges for Zs,Zl or Zp property */ +static const uint16_t char_range_s[] = { + 10, + 0x0009, 0x000D + 1, + 0x0020, 0x0020 + 1, + 0x00A0, 0x00A0 + 1, + 0x1680, 0x1680 + 1, + 0x2000, 0x200A + 1, + /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ + /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ + 0x2028, 0x2029 + 1, + 0x202F, 0x202F + 1, + 0x205F, 0x205F + 1, + 0x3000, 0x3000 + 1, + /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ + 0xFEFF, 0xFEFF + 1, +}; + +BOOL lre_is_space_non_ascii(uint32_t c) +{ + size_t i, n; + + n = countof(char_range_s); + for(i = 5; i < n; i += 2) { + uint32_t low = char_range_s[i]; + uint32_t high = char_range_s[i + 1]; + if (c < low) + return FALSE; + if (c < high) + return TRUE; + } + return FALSE; +} diff --git a/libregexp/libunicode.h b/libregexp/libunicode.h index cfa600a..cc2f244 100644 --- a/libregexp/libunicode.h +++ b/libregexp/libunicode.h @@ -1,6 +1,6 @@ /* * Unicode utilities - * + * * Copyright (c) 2017-2018 Fabrice Bellard * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -24,26 +24,13 @@ #ifndef LIBUNICODE_H #define LIBUNICODE_H -#include - -#define LRE_BOOL int /* for documentation purposes */ +#include /* define it to include all the 
unicode tables (40KB larger) */ #define CONFIG_ALL_UNICODE #define LRE_CC_RES_LEN_MAX 3 -typedef enum { - UNICODE_NFC, - UNICODE_NFD, - UNICODE_NFKC, - UNICODE_NFKD, -} UnicodeNormalizationEnum; - -int lre_case_conv(uint32_t *res, uint32_t c, int conv_type); -LRE_BOOL lre_is_cased(uint32_t c); -LRE_BOOL lre_is_case_ignorable(uint32_t c); - /* char ranges */ typedef struct { @@ -101,10 +88,14 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, int cr_invert(CharRange *cr); -#ifdef CONFIG_ALL_UNICODE +int cr_regexp_canonicalize(CharRange *cr, int is_unicode); -LRE_BOOL lre_is_id_start(uint32_t c); -LRE_BOOL lre_is_id_continue(uint32_t c); +typedef enum { + UNICODE_NFC, + UNICODE_NFD, + UNICODE_NFKC, + UNICODE_NFKD, +} UnicodeNormalizationEnum; int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, UnicodeNormalizationEnum n_type, @@ -112,13 +103,80 @@ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, /* Unicode character range functions */ -int unicode_script(CharRange *cr, - const char *script_name, LRE_BOOL is_ext); +int unicode_script(CharRange *cr, const char *script_name, int is_ext); int unicode_general_category(CharRange *cr, const char *gc_name); int unicode_prop(CharRange *cr, const char *prop_name); -#endif /* CONFIG_ALL_UNICODE */ +int lre_case_conv(uint32_t *res, uint32_t c, int conv_type); +int lre_canonicalize(uint32_t c, int is_unicode); + +/* Code point type categories */ +enum { + UNICODE_C_SPACE = (1 << 0), + UNICODE_C_DIGIT = (1 << 1), + UNICODE_C_UPPER = (1 << 2), + UNICODE_C_LOWER = (1 << 3), + UNICODE_C_UNDER = (1 << 4), + UNICODE_C_DOLLAR = (1 << 5), + UNICODE_C_XDIGIT = (1 << 6), +}; +extern uint8_t const lre_ctype_bits[256]; + +/* zero or non-zero return value */ +int lre_is_cased(uint32_t c); +int lre_is_case_ignorable(uint32_t c); +int lre_is_id_start(uint32_t c); +int lre_is_id_continue(uint32_t c); + +static inline int lre_is_space_byte(uint8_t c) { + return lre_ctype_bits[c] & 
UNICODE_C_SPACE; +} + +static inline int lre_is_id_start_byte(uint8_t c) { + return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | + UNICODE_C_UNDER | UNICODE_C_DOLLAR); +} -#undef LRE_BOOL +static inline int lre_is_id_continue_byte(uint8_t c) { + return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | + UNICODE_C_UNDER | UNICODE_C_DOLLAR | + UNICODE_C_DIGIT); +} + +int lre_is_space_non_ascii(uint32_t c); + +static inline int lre_is_space(uint32_t c) { + if (c < 256) + return lre_is_space_byte(c); + else + return lre_is_space_non_ascii(c); +} + +static inline int lre_js_is_ident_first(uint32_t c) { + if (c < 128) { + return lre_is_id_start_byte(c); + } else { +#ifdef CONFIG_ALL_UNICODE + return lre_is_id_start(c); +#else + return !lre_is_space_non_ascii(c); +#endif + } +} + +static inline int lre_js_is_ident_next(uint32_t c) { + if (c < 128) { + return lre_is_id_continue_byte(c); + } else { + /* ZWNJ and ZWJ are accepted in identifiers */ + if (c >= 0x200C && c <= 0x200D) + return TRUE; +#ifdef CONFIG_ALL_UNICODE + return lre_is_id_continue(c); +#else + return !lre_is_space_non_ascii(c); +#endif + } +} #endif /* LIBUNICODE_H */ From 5f64d0ac2b409fd8feccd22e2401e6e438135a94 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Tue, 25 Jun 2024 16:31:51 +0200 Subject: [PATCH 5/8] implement d flag --- jsregexp.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/jsregexp.c b/jsregexp.c index e01d2be..9452cea 100644 --- a/jsregexp.c +++ b/jsregexp.c @@ -335,6 +335,7 @@ static int regexp_gc(lua_State *lstate) { static void regexp_pushflags(lua_State *lstate, const struct regexp *r) { const int flags = lre_get_flags(r->bc); + const char *indices = (flags & LRE_FLAG_INDICES) ? "d" : ""; const char *ignorecase = (flags & LRE_FLAG_IGNORECASE) ? "i" : ""; const char *global = (flags & LRE_FLAG_GLOBAL) ? 
"g" : ""; const char *multiline = (flags & LRE_FLAG_MULTILINE) ? "m" : ""; @@ -342,8 +343,8 @@ static void regexp_pushflags(lua_State *lstate, const struct regexp *r) { const char *dotall = (flags & LRE_FLAG_DOTALL) ? "s" : ""; const char *utf16 = (flags & LRE_FLAG_UNICODE) ? "u" : ""; const char *sticky = (flags & LRE_FLAG_STICKY) ? "y" : ""; - lua_pushfstring(lstate, "%s%s%s%s%s%s%s", ignorecase, global, multiline, - named_groups, dotall, utf16, sticky); + lua_pushfstring(lstate, "%s%s%s%s%s%s%s%s", indices, ignorecase, global, + multiline, named_groups, dotall, utf16, sticky); } static int regexp_tostring(lua_State *lstate) { @@ -394,6 +395,7 @@ static int regexp_exec(lua_State *lstate) { const int capture_count = lre_get_capture_count(r->bc); const char *group_names = lre_get_groupnames(r->bc); + const bool has_indices = lre_get_flags(r->bc) & LRE_FLAG_INDICES; const int ret = lre_exec(capture, r->bc, (uint8_t *)input->u.str8, rlast_index, @@ -438,34 +440,94 @@ static int regexp_exec(lua_State *lstate) { } lua_setfield(lstate, -2, "index"); + if (has_indices) { + // [match] + lua_createtable(lstate, capture_count + 1, 0); // match.indices + // [match, indices] + if (group_names) { + // push indices.groups table, duplicate it and leave it below match + lua_createtable(lstate, 0, capture_count); // match.indices.groups + // [match, indices, groups] + lua_pushvalue(lstate, -1); + // [match, indices, groups, groups] + lua_insert(lstate, -4); + // [indices.groups, match, indices, groups] + lua_setfield(lstate, -2, "groups"); + // [indices.groups, match, indices] + } + lua_pushvalue(lstate, -1); + // [..., match, indices, indices] + lua_setfield(lstate, -3, "indices"); + // [..., match, indices] + lua_insert(lstate, -2); // leave table below the match table + // [..., indices, match] + } + if (group_names) { + // [..., match] lua_newtable(lstate); // match.groups + // [..., match, groups] lua_pushvalue(lstate, -1); + // [..., match, groups, groups] 
lua_setfield(lstate, -3, "groups"); // immediately insert into match - lua_insert(lstate, -2); // leave table below the match table + // [..., match, groups] + lua_insert(lstate, -2); // leave table below the match table + // [..., groups, match] } + // [groups.indices?, indices?, groups?, match] + for (int i = 0; i < capture_count; i++) { + uint32_t a, b; if (input->is_wide_char) { - const uint32_t a = input->indices[(capture[2 * i] - input->u.str8) / 2]; - const uint32_t b = - input->indices[(capture[2 * i + 1] - input->u.str8) / 2]; + a = input->indices[(capture[2 * i] - input->u.str8) / 2]; + b = input->indices[(capture[2 * i + 1] - input->u.str8) / 2]; lua_pushlstring(lstate, input->bstr + a, b - a); } else { + a = capture[2 * i] - input->u.str8; + b = capture[2 * i + 1] - input->u.str8; lua_pushlstring(lstate, (char *)capture[2 * i], capture[2 * i + 1] - capture[2 * i]); } + + if (has_indices) { + lua_createtable(lstate, 2, 0); + lua_pushinteger(lstate, a + 1); + lua_rawseti(lstate, -2, 1); + lua_pushinteger(lstate, b); + lua_rawseti(lstate, -2, 2); + // [..., match, string, {a, b}] + if (group_names) { + // [indices.groups, indices, groups, match, string, {a, b}] + if (i > 0 && *group_names) { + // if the current group is named, duplicate and insert into the + // correct table + lua_pushvalue(lstate, -1); + // [indices.groups, indices, groups, match, string, {a, b}, {a,b}] + lua_setfield(lstate, -7, group_names); + } + // [indices.groups, indices, groups, match, string, {a, b}] + lua_rawseti(lstate, -5, i); + } else { + // [indices, match, string, {a, b}] + lua_rawseti(lstate, -4, i); + } + } + if (i > 0 && group_names) { + // [..., groups, match, string] // if the current group is named, duplicate and insert into the correct // table if (*group_names) { lua_pushvalue(lstate, -1); + // [..., groups, match, string, string] lua_setfield(lstate, -4, group_names); group_names += strlen(group_names); } group_names++; } + // [..., match, string] 
lua_rawseti(lstate, -2, i); } @@ -507,6 +569,8 @@ static int regexp_index(lua_State *lstate) { lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_STICKY); } else if (streq(key, "unicode")) { lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_UNICODE); + } else if (streq(key, "has_indices")) { + lua_pushboolean(lstate, lre_get_flags(r->bc) & LRE_FLAG_INDICES); } else if (streq(key, "source")) { lua_pushstring(lstate, r->expr); } else if (streq(key, "flags")) { @@ -564,6 +628,9 @@ static int jsregexp_compile(lua_State *lstate) { const char *flags = luaL_checkstring(lstate, 2); while (*flags) { switch (*(flags++)) { + case 'd': + re_flags |= LRE_FLAG_INDICES; + break; case 'i': re_flags |= LRE_FLAG_IGNORECASE; break; From 4ad55b0dd071b4d9009932c3a5208e2571e86e3d Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Tue, 25 Jun 2024 16:36:03 +0200 Subject: [PATCH 6/8] test d flag --- test.lua | 126 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 34 deletions(-) diff --git a/test.lua b/test.lua index 7786b9f..34b8d67 100644 --- a/test.lua +++ b/test.lua @@ -1,4 +1,5 @@ local jsregexp = require("jsregexp") +local unpack = unpack or table.unpack local tests = 0 local fails = 0 @@ -130,6 +131,50 @@ local function test_exec(str, regex, flags, want) end end end + if match_wanted.indices and not match.indices then + return fail("expected indices table") + end + if not match_wanted.indices and match.indices then + return fail("expected no indices table") + end + if match_wanted.indices then + if match_wanted.indices.groups and not match.indices.groups then + return fail("expected indices.groups table") + end + if not match_wanted.indices.groups and match.indices.groups then + return fail("expected no indices.groups table") + end + for i = 0, #match.indices do + local a, b = unpack(match_wanted.indices[i]) + local c, d = unpack(match.indices[i]) + if a ~= c or b ~= d then + return fail( + 
string.format("wrong indices for group %d, expected {%d, %d}, got {%d, %d}", i, a, b, c, d) + ) + end + end + if match_wanted.indices.groups then + for key, val in pairs(match.indices.groups) do + if not match_wanted.indices.groups[key] then + return fail(string.format("unexpected key in indices.groups: %s", key)) + end + local a, b = unpack(match_wanted.indices.groups[key]) + local c, d = unpack(val) + if a ~= c or b ~= d then + return fail( + string.format( + "wrong indices for group %s, expected {%d, %d}, got {%d, %d}", + key, + a, + b, + c, + d + ) + ) + end + end + end + end end local match = r:exec(str) if r.global and match then @@ -388,42 +433,39 @@ test_call( -- test("จงฝ่าฟันพัฒนาวิชาการ", "(จงฝ่าฟันพัฒนาวิชาการ)", "", {{"จงฝ่าฟันพัฒนาวิชาการ", groups="จงฝ่าฟันพัฒนาวิชาการ"}}) -- named groups: -test_call( - "The quick brown fox jumps over the lazy dog", - "(?\\w+) (\\w+) (?\\w+)", - "n", - { { "The quick brown", groups = { "The", "quick", "brown" }, named_groups = { first_word = "The", third_word = "brown" } } } -) +test_call("The quick brown fox jumps over the lazy dog", "(?\\w+) (\\w+) (?\\w+)", "n", { + { + "The quick brown", + groups = { "The", "quick", "brown" }, + named_groups = { first_word = "The", third_word = "brown" }, + }, +}) test_call( "The qüick bröwn föx jümps över the lazy dög", "(?[^ ]+) ([^ ]+) (?[^ ]+)", "n", - { { "The qüick bröwn", groups = { "The", "qüick", "bröwn" }, named_groups = { - first_word = "The", - third_word = "bröwn", - } } } -) -test_call( - "The quick bröwn föx", - "(?[^ ]+) ([^ ]+) (?[^ ]+)", - "n", { { - "The quick bröwn", - groups = { "The", "quick", "bröwn" }, - named_groups = { ["first_wörd"] = "The", ["third_wörd"] = "bröwn" }, + "The qüick bröwn", + groups = { "The", "qüick", "bröwn" }, + named_groups = { + first_word = "The", + third_word = "bröwn", + }, }, } ) -test_call( - "𝄞𝄞 𐐷", - "(?[^ ]+)", - "ng", +test_call("The quick bröwn föx", "(?[^ ]+) ([^ ]+) (?[^ ]+)", "n", { { - { "𝄞𝄞", groups = { "𝄞𝄞" },
named_groups = { word = "𝄞𝄞" } }, - { "𐐷", groups = { "𐐷" }, named_groups = { word = "𐐷" } }, - } -) + "The quick bröwn", + groups = { "The", "quick", "bröwn" }, + named_groups = { ["first_wörd"] = "The", ["third_wörd"] = "bröwn" }, + }, +}) +test_call("𝄞𝄞 𐐷", "(?[^ ]+)", "ng", { + { "𝄞𝄞", groups = { "𝄞𝄞" }, named_groups = { word = "𝄞𝄞" } }, + { "𐐷", groups = { "𐐷" }, named_groups = { word = "𐐷" } }, +}) test_exec("The quick brown", "\\w+", "g", { { [0] = "The" }, { [0] = "quick" }, { [0] = "brown" } }) test_exec( @@ -432,15 +474,31 @@ test_exec( "g", { { [0] = "The quick", "The", "quick" }, { [0] = "brown fox", "brown", "fox" } } ) -test_exec( - "The quick brown fox", - "(?\\w+) (\\w+)", - "g", +test_exec("The quick brown fox", "(?\\w+) (\\w+)", "g", { + { [0] = "The quick", "The", "quick", groups = { word1 = "The" } }, + { [0] = "brown fox", "brown", "fox", groups = { word1 = "brown" } }, +}) + +test_exec("The Quick Brown Fox Jumps Over The Lazy Dog", "quick\\s(?brown).+?(jumps)", "di", { { - { [0] = "The quick", "The", "quick", groups = { word1 = "The" } }, - { [0] = "brown fox", "brown", "fox", groups = { word1 = "brown" } }, - } -) + [0] = "Quick Brown Fox Jumps", + [1] = "Brown", + [2] = "Jumps", + indices = { + [0] = { 5, 25 }, + [1] = { 11, 15 }, + [2] = { 21, 25 }, + groups = { + color = { 11, 15 }, + }, + }, + index = 4, + input = "The Quick Brown Fox Jumps Over The Lazy Dog", + groups = { + color = "Brown", + }, + }, +}) test_test("The quick brown", "\\w+", "", { true }) test_test("The quick brown", "\\d+", "", { false }) From f99bfa058dfc0cf7b91f3d5d369d2f847f0ff9b3 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:19:00 +0200 Subject: [PATCH 7/8] update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c64e1f7..387e0a8 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ jsregexp.compile_safe(regex, flags?) 
``` that take an ECMAScript regular expression as a string and an optional string of flags, most notably +- `"d"`: provides tables with begin/end indices of match groups in match objects - `"i"`: case insensitive search - `"g"`: match globally - `"n"`: enables named groups (not present in JavaScript, needs to be enabled manually if needed) @@ -50,6 +51,7 @@ re.source -- the regexp string re.flags -- a string representing the active flags re.dot_all -- is the dod_all flag set? re.global -- is the global flag set? +re.has_indices -- is the indices flag set? re.ignore_case -- is the ignore_case flag set? re.multiline -- is the multiline flag set? re.sticky -- is the sticky flag set? @@ -88,6 +90,8 @@ m.input -- the input string m.capture_count -- number of capture groups m.index -- start of the capture (1-based) m.groups -- table of the named groups and their content +m.indices -- table of {begin, end} indices (1-based, end inclusive) of all match groups (if "d" flag is set) +m.indices.groups -- table of named groups and their {begin, end} indices (if "d" flag is set) ``` Calling `tostring` on a match object returns the full match `m[0]`. From 30c86aca7e6ea740d035ff89f0ab14e63481f22d Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Fri, 5 Jul 2024 20:00:44 +0200 Subject: [PATCH 8/8] disable testing on macos --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ac07a63..2915895 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,7 +21,7 @@ jobs: "luajit-2.1.0-beta3", ] # TODO: add windows-latest once: https://github.com/leafo/gh-actions-lua/pull/23 is fully released - machineTag: ["ubuntu-latest", "macos-latest"] + machineTag: ["ubuntu-latest"] runs-on: ${{ matrix.machineTag }} steps: - uses: actions/checkout@v2