diff --git a/examples/simple.roc b/examples/simple.roc index 0a410a5..5eae6fd 100644 --- a/examples/simple.roc +++ b/examples/simple.roc @@ -16,6 +16,6 @@ maybeLength = word |> Str.toUtf8 |> CodePoint.parseUtf8 |> Result.map List.len main = when maybeLength is - Ok count -> Stdout.line "\n\nThere are a total of \(Num.toStr count) code points in \(word)\n\n" - Err _ -> crash "ERROR: Unable to parse \(word)!" + Ok count -> Stdout.line "\n\nThere are a total of $(Num.toStr count) code points in $(word)\n\n" + Err _ -> crash "ERROR: Unable to parse $(word)!" diff --git a/package/CodePoint.roc b/package/CodePoint.roc index 6a3dfa5..cde6387 100644 --- a/package/CodePoint.roc +++ b/package/CodePoint.roc @@ -29,40 +29,34 @@ toU32 = InternalCP.toU32 ## (that is, it's between `0` and `0x10FFFF`). fromU32 : U32 -> Result CodePoint [InvalidCodePoint] fromU32 = \u32 -> - # Definition: http://www.unicode.org/glossary/#code_point if u32 <= 0x10FFFF then Ok (fromU32Unchecked u32) else Err InvalidCodePoint -## Returns false if this is either a [high-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point) -## or a [low-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point). 
-## -## To check for either of those individually, use [isHighSurrogate] or [isLowSurrogate] +## Returns false if this is [isHighSurrogate] or [isLowSurrogate] isValidScalar : CodePoint -> Bool isValidScalar = \codePoint -> !(isHighSurrogate codePoint || isLowSurrogate codePoint) ## Returns true if this is a [high-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point) -## (`0xD800` to `0xDBFF`) +## from U+D800 to U+DBFF isHighSurrogate : CodePoint -> Bool isHighSurrogate = \codePoint -> - u32 = InternalCP.toU32 codePoint - - u32 >= 0xDC00 && u32 <= 0xDFFF + u32 = toU32 codePoint + u32 >= 0xD800 && u32 <= 0xDBFF ## Returns true if this is a [low-surrogate code point](https://www.unicode.org/glossary/#low_surrogate_code_point) -## U+DC00 to U+DFFF +## from U+DC00 to U+DFFF isLowSurrogate : CodePoint -> Bool isLowSurrogate = \codePoint -> - u32 = InternalCP.toU32 codePoint - + u32 = toU32 codePoint u32 >= 0xDC00 && u32 <= 0xDFFF ## Zig docs: bytes the UTF-8 representation would require ## for the given codepoint. -utf8Len : CodePoint -> Result U64 [InvalidCodePoint] +utf8Len : CodePoint -> Result U8 [InvalidCodePoint] utf8Len = \codePoint -> - u32 = InternalCP.toU32 codePoint + u32 = toU32 codePoint if u32 < 0x80 then Ok 1 @@ -78,8 +72,7 @@ utf8Len = \codePoint -> ## Encode a Scalar as UTF-8 bytes and append those bytes to an existing list of UTF-8 bytes. appendUtf8 : List U8, CodePoint -> List U8 appendUtf8 = \bytes, codePoint -> - u32 = InternalCP.toU32 codePoint - + u32 = toU32 codePoint if u32 < 0x80 then List.append bytes (Num.toU8 u32) else if u32 < 0x800 then @@ -173,9 +166,9 @@ addContinuation = \original, continuationByte -> |> Num.bitwiseOr (Num.toU32 (Num.bitwiseAnd continuationByte 0b00111111)) ## The number of UTF-8 bytes it takes to represent this Scalar. 
-countUtf8Bytes : CodePoint -> U64 +countUtf8Bytes : CodePoint -> U8 countUtf8Bytes = \codePoint -> - u32 = InternalCP.toU32 codePoint + u32 = toU32 codePoint if u32 < 0x80 then 1 @@ -252,11 +245,9 @@ Utf8ParseErr : [OverlongEncoding, ExpectedContinuation, EncodesSurrogateHalf, In parseUtf8 : List U8 -> Result (List CodePoint) Utf8ParseErr parseUtf8 = \bytes -> - # we will have at most List.len bytes code points listWithCapacity : List CodePoint listWithCapacity = List.withCapacity (List.len bytes) - parseUtf8Help bytes listWithCapacity parseUtf8Help : List U8, List CodePoint -> Result (List CodePoint) Utf8ParseErr @@ -339,34 +330,34 @@ parsePartialUtf8 = \bytes -> else Err InvalidUtf8 - -toStr : List CP -> Result Str [BadUtf8] +toStr : List CodePoint -> Result Str [BadUtf8] toStr = \cps -> - # allocated extra space for the extra bytes as some CPs expand into + # allocated extra space for the extra bytes as some CPs expand into # multiple U8s, so this minimises extra allocations capacity = List.withCapacity (50 + List.len cps) - cps - |> cpsToStrHelp capacity + cps + |> cpsToStrHelp capacity |> Str.fromUtf8 |> Result.onErr \_ -> Err BadUtf8 -cpsToStrHelp : List CP, List U8 -> List U8 +cpsToStrHelp : List CodePoint, List U8 -> List U8 cpsToStrHelp = \cps, bytes -> - when cps is + when cps is [] -> bytes - [cp,..] -> - cpsToStrHelp + [cp, ..] 
-> + cpsToStrHelp (List.dropFirst cps 1) (CodePoint.appendUtf8 bytes cp) - -expect # test toStr - cr = (fromU32Unchecked 13) - lf = (fromU32Unchecked 10) + +expect + # test toStr + cr = fromU32Unchecked 13 + lf = fromU32Unchecked 10 toStr [cr, lf] == Ok "\r\n" - + ## Empty input expect [] |> parsePartialUtf8 == Err ListWasEmpty diff --git a/package/Grapheme.roc b/package/Grapheme.roc index d91729a..5240ece 100644 --- a/package/Grapheme.roc +++ b/package/Grapheme.roc @@ -18,11 +18,11 @@ Grapheme : InternalGBP.GBP # Note GB13 is not used, it has been merged with GB12 as they are identical as far as I can tell Rule : [GB1, GB2, GB3, GB4, GB5, GB6, GB7, GB8, GB9, GB9a, GB9b, GB9c, GB11, GB12, GB999] -# User internally to represent the text segmentation algorithm. We include the +# User internally to represent the text segmentation algorithm. We include the # Rules here so that it is feasible to debug this and ensure algorithm correctness -# We could remove these and reduce the number of allocations, however it is very +# We could remove these and reduce the number of allocations, however it is very # difficult then to understand if the implementation is applying each rule correctly -Tokens : List [BR Rule,NB Rule,CP CodePoint] +Tokens : List [BR Rule, NB Rule, CP CodePoint] ## Split a string into extended grapheme clusters split : Str -> Result (List Str) Utf8ParseErr @@ -63,19 +63,17 @@ splitHelp = \state, codePoints, breakPoints, acc -> nextBPs = List.dropFirst breakPoints 1 when (state, codePoints, breakPoints) is - # Special handling for last codepoint (Next, [cp], _) -> List.concat acc [CP cp, BR GB2] - (AfterHungulL prev, [cp], [bp]) if bp == L || bp == V || bp == LV || bp == LVT -> List.concat acc [CP prev, NB GB6, CP cp, BR GB2] - (AfterHungulLVorV prev, [cp], [bp]) if bp == V || bp == T -> List.concat acc [CP prev, NB GB7, CP cp, BR GB2] - (AfterHungulLVTorT prev, [cp], [bp]) if bp == T -> List.concat acc [CP prev, NB GB8, CP cp, BR GB2] - (AfterHungulL prev, 
[_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc - (AfterHungulLVorV prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc - (AfterHungulLVTorT prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc + (AfterHangulL prev, [cp], [bp]) if bp == L || bp == V || bp == LV || bp == LVT -> List.concat acc [CP prev, NB GB6, CP cp, BR GB2] + (AfterHangulLVorV prev, [cp], [bp]) if bp == V || bp == T -> List.concat acc [CP prev, NB GB7, CP cp, BR GB2] + (AfterHangulLVTorT prev, [cp], [bp]) if bp == T -> List.concat acc [CP prev, NB GB8, CP cp, BR GB2] + (AfterHangulL prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc + (AfterHangulLVorV prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc + (AfterHangulLVTorT prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc (LastWithPrev prev, [cp], [bp]) if bp == Control || bp == CR || bp == LF -> List.concat acc [CP prev, BR GB5, CP cp, BR GB2] (LastWithPrev prev, [cp], [bp]) if bp == Extend -> List.concat acc [CP prev, NB GB9, CP cp, BR GB2] (LastWithPrev prev, [cp], [bp]) if bp == ZWJ -> - if prev |> CodePoint.toU32 |> InternalEmoji.isPictographic then List.concat acc [CP prev, NB GB11, CP cp, BR GB2] else @@ -87,26 +85,21 @@ splitHelp = \state, codePoints, breakPoints, acc -> (EmojiSeqNext prev, [], []) -> List.concat acc [CP prev, BR GB2] (EmojiSeqNext prev, [cp], [_]) -> List.concat acc [CP prev, NB GB11, CP cp, BR GB2] (EmojiSeqZWJ prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc - (AfterEvenRI prev, [], []) -> List.concat acc [CP prev, BR GB2] (AfterOddRI prev, [], []) -> List.concat acc [CP prev, BR GB2] - # Looking at current breakpoint property (Next, [cp, ..], [bp, ..]) if bp == CR -> splitHelp (AfterCR cp) nextCPs nextBPs acc (Next, [cp, ..], [bp, ..]) if bp == Control || bp == LF -> splitHelp Next nextCPs nextBPs (List.concat acc [CP cp, BR GB4]) - (Next, [cp, ..], 
[bp, ..]) if bp == L -> splitHelp (AfterHungulL cp) nextCPs nextBPs acc - (Next, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs acc - (Next, [cp, ..], [bp, ..]) if bp == LVT || bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs acc + (Next, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHangulL cp) nextCPs nextBPs acc + (Next, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs acc + (Next, [cp, ..], [bp, ..]) if bp == LVT || bp == T -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs acc (Next, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterOddRI cp) nextCPs nextBPs acc - - # Advance to next, this is requred so that we can apply rules which break before + # Advance to next, this is required so that we can apply rules which break before (Next, [cp, ..], _) -> splitHelp (LookAtNext cp) nextCPs nextBPs acc - # Looking ahead at next, given previous (LookAtNext prev, _, [bp, ..]) if bp == Control || bp == CR || bp == LF -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB5]) (LookAtNext prev, [cp, ..], [bp, ..]) if bp == Extend -> splitHelp (AfterExtend cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9]) (LookAtNext prev, [cp, ..], [bp, ..]) if bp == ZWJ -> - if prev |> CodePoint.toU32 |> InternalEmoji.isPictographic then # enter emoji sequence splitHelp (EmojiSeqNext cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9]) @@ -115,7 +108,6 @@ splitHelp = \state, codePoints, breakPoints, acc -> # Look ahead, given previous was Emoji related (EmojiSeqZWJ prev, [cp, ..], [bp, ..]) -> - if bp == ZWJ then # got another ZWJ continue the sequence splitHelp (EmojiSeqNext cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB11]) @@ -123,7 +115,6 @@ splitHelp = \state, codePoints, breakPoints, acc -> splitHelp Next codePoints breakPoints acc (EmojiSeqNext prev, [cp, ..], [_, ..]) -> - if cp |> CodePoint.toU32 |> InternalEmoji.isPictographic then # got 
another emoji, continue the sequence splitHelp (EmojiSeqZWJ cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB11]) @@ -134,31 +125,27 @@ splitHelp = \state, codePoints, breakPoints, acc -> (AfterExtend prev, [cp, ..], [bp, ..]) if bp == Extend -> splitHelp (AfterExtend cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9]) (AfterExtend prev, [_, ..], [_, ..]) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB999]) (LookAtNext prev, _, _) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB999]) - # Looking ahead, given previous was a Regional Indicator (AfterOddRI prev, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterEvenRI cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB12]) (AfterOddRI prev, [_, ..], [_, ..]) -> splitHelp (LookAtNext prev) codePoints breakPoints acc (AfterEvenRI prev, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterOddRI cp) nextCPs nextBPs (List.concat acc [CP prev, BR GB999]) (AfterEvenRI prev, [_, ..], [_, ..]) -> splitHelp (LookAtNext prev) codePoints breakPoints acc - # Looking ahead, given previous was CR (AfterCR prev, _, [bp, ..]) if bp == LF -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, NB GB3]) (AfterCR prev, _, _) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB4]) - # Looking ahead, given previous was Hangul - (AfterHungulL prev, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHungulL cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6]) - (AfterHungulL prev, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6]) - (AfterHungulL prev, [cp, ..], [bp, ..]) if bp == LVT -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6]) - (AfterHungulL prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc - (AfterHungulL prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc - 
(AfterHungulLVorV prev, [cp, ..], [bp, ..]) if bp == V -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7]) - (AfterHungulLVorV prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7]) - (AfterHungulLVorV prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc - (AfterHungulLVorV prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc - (AfterHungulLVTorT prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB8]) - (AfterHungulLVTorT prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc - (AfterHungulLVTorT prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc - + (AfterHangulL prev, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHangulL cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6]) + (AfterHangulL prev, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6]) + (AfterHangulL prev, [cp, ..], [bp, ..]) if bp == LVT -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6]) + (AfterHangulL prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc + (AfterHangulL prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc + (AfterHangulLVorV prev, [cp, ..], [bp, ..]) if bp == V -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7]) + (AfterHangulLVorV prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7]) + (AfterHangulLVorV prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc + (AfterHangulLVorV prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc + (AfterHangulLVTorT prev, [cp, ..], [bp, ..]) if bp == T 
-> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB8]) + (AfterHangulLVTorT prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc + (AfterHangulLVTorT prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc # Print out a helpful error message requesting users report the unhandled case. _ -> crash @@ -168,10 +155,10 @@ splitHelp = \state, codePoints, breakPoints, acc -> It is difficult to track down and catch every possible combination, so it would be helpful if you could log this as an issue with a reproduction. Grapheme.split state machine state at the time was: - \(Inspect.toStr (state, List.map codePoints CodePoint.toU32, breakPoints)) + $(Inspect.toStr (state, List.map codePoints CodePoint.toU32, breakPoints)) """ -# Used internally as a test helper to generate the expected answer for a given +# Used internally as a test helper to generate the expected answer for a given # input. Most of the test inputs come from the test data, some are manually developed # to cover additional edge cases not found in the test data file. 
testHelp : List (List U32) -> Tokens @@ -342,7 +329,7 @@ expect ] a == b -# GB11 emoji another complicated example +# GB11 emoji another complicated example # % [0.2] LATIN SMALL LETTER A (Other) x [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) % [999.0] BABY (ExtPict) x [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) x [11.0] OCTAGONAL SIGN (ExtPict) % [0.3] expect a = testHelp [[97, 127999], [128118, 8205, 128721]] @@ -383,4 +370,4 @@ expect a == b expect split "🥷🏼" == Ok ["🥷🏼"] -expect split "🇦🇺🦘🪃" == Ok ["🇦🇺", "🦘", "🪃"] \ No newline at end of file +expect split "🇦🇺🦘🪃" == Ok ["🇦🇺", "🦘", "🪃"] diff --git a/package/GraphemeTestGen.roc b/package/GraphemeTestGen.roc index 73071cd..f643e88 100644 --- a/package/GraphemeTestGen.roc +++ b/package/GraphemeTestGen.roc @@ -1,6 +1,6 @@ ## The purpose of this file is to generate the GraphemeTest.roc test suite. -## -## This file will read the test data from `data/GraphemeBreakTest-15.1.0.txt` +## +## This file will read the test data from `data/GraphemeBreakTest-15.1.0.txt` ## parse it and then generate the individual tests. 
app "gen" packages { @@ -27,7 +27,7 @@ Rule : [GB1, GB2, GB3, GB4, GB5, GB6, GB7, GB8, GB9, GB9a, GB9b, GB9c, GB11, GB1 TestTokens : List [BR Rule, NB Rule, CP CodePoint] main : Task {} I32 -main = getFilePath |> Task.await writeToFile |> Task.onErr \err -> Stderr.line "\(err)" +main = getFilePath |> Task.await writeToFile |> Task.onErr \err -> Stderr.line "$(err)" template : Str template = @@ -51,18 +51,18 @@ template = test.lineStr |> Str.replaceEach "÷" "%" # replace % |> Str.replaceEach "×" "x" # replace X - |> Str.replaceEach " " " " # reaplce tabs with a space + |> Str.replaceEach " " " " # replace tabs with a space codePointsList = test.parsed |> toU32List """ - # GraphemeBreakTest-15.1.0.txt:line \(Num.toStr test.lineNo) - # \(sanitisedLine) + # GraphemeBreakTest-15.1.0.txt:line $(Num.toStr test.lineNo) + # $(sanitisedLine) expect - exp = Ok \(codePointsList |> Inspect.toStr) + exp = Ok $(codePointsList |> Inspect.toStr) got = - \(codePointsList + $(codePointsList |> List.join |> Inspect.toStr) |> List.map InternalCP.fromU32Unchecked @@ -87,7 +87,7 @@ template = Ok cps -> List.map cps CodePoint.toU32 Err _ -> crash \"expected valid utf8\" - \(tests) + $(tests) """ getFilePath : Task Path Str @@ -95,14 +95,14 @@ getFilePath = args <- Arg.list |> Task.await when args |> List.get 1 is - Ok arg -> Task.ok (Path.fromStr "\(Helpers.removeTrailingSlash arg)/GraphemeTest.roc") + Ok arg -> Task.ok (Path.fromStr "$(Helpers.removeTrailingSlash arg)/GraphemeTest.roc") Err _ -> Task.err "USAGE: roc run GraphemeTest.roc -- path/to/package/" writeToFile : Path -> Task {} Str writeToFile = \path -> File.writeUtf8 path template - |> Task.mapErr \_ -> "ERROR: unable to write to \(Path.display path)" - |> Task.await \_ -> Stdout.line "\nSucessfully wrote to \(Path.display path)\n" + |> Task.mapErr \_ -> "ERROR: unable to write to $(Path.display path)" + |> Task.await \_ -> Stdout.line "\nSuccessfully wrote to $(Path.display path)\n" toU32List : TestTokens -> List (List U32) 
toU32List = \tokens -> @@ -129,7 +129,7 @@ testFile = |> List.map \test -> when parseStr testParser test.lineStr is Ok (Ok parsed) -> { test & parsed } - Err err | Ok (Err err) -> crash "Unable to parse line \(Num.toStr test.lineNo) got err \(Inspect.toStr err)" + Err err | Ok (Err err) -> crash "Unable to parse line $(Num.toStr test.lineNo) got err $(Inspect.toStr err)" testParser : Parser (List U8) (Result TestTokens _) testParser = @@ -244,7 +244,7 @@ zip = \first, second -> Ok (List.append next (CP cp)) (Err _, Err _) -> Ok [] # base case - _ -> Err (Invalid "expected first and second lists to match exactly got \(Inspect.toStr (T first second))") + _ -> Err (Invalid "expected first and second lists to match exactly got $(Inspect.toStr (T first second))") expect answer = diff --git a/package/Helpers.roc b/package/Helpers.roc index 62288e0..e1e2c54 100644 --- a/package/Helpers.roc +++ b/package/Helpers.roc @@ -1,4 +1,4 @@ -interface Helpers +interface Helpers exposes [ PropertyMap, CPMeta, @@ -9,7 +9,7 @@ interface Helpers startsWithHex, hexBytesToU32, hexStrToU32, - properyMapFromFile, + propertyMapFromFile, filterPropertyMap, metaToExpression, ] @@ -124,8 +124,8 @@ hexToDec = \byte -> expect hexToDec '0' == 0 expect hexToDec 'F' == 15 -properyMapFromFile : Str, (Str -> Result a [ParsingError]) -> List { cp : CPMeta, prop : a } -properyMapFromFile = \file, parsePropPart -> +propertyMapFromFile : Str, (Str -> Result a [ParsingError]) -> List { cp : CPMeta, prop : a } +propertyMapFromFile = \file, parsePropPart -> file |> Str.split "\n" |> List.keepOks Helpers.startsWithHex @@ -134,9 +134,9 @@ properyMapFromFile = \file, parsePropPart -> [hexPart, propPart] -> when (parseHexPart hexPart, parsePropPart propPart) is (Ok cp, Ok prop) -> { cp, prop } - _ -> crash "Error parsing line -- \(l)" + _ -> crash "Error parsing line -- $(l)" - _ -> crash "Error unexpected ';' on line -- \(l)" + _ -> crash "Error unexpected ';' on line -- $(l)" parseHexPart : Str -> Result 
CPMeta [ParsingError] parseHexPart = \hexPart -> @@ -173,5 +173,5 @@ expect codePointParser "# ===" == Err ParsingError metaToExpression : CPMeta -> Str metaToExpression = \cp -> when cp is - Single a -> "(u32 == \(Num.toStr a))" - Range a b -> "(u32 >= \(Num.toStr a) && u32 <= \(Num.toStr b))" + Single a -> "(u32 == $(Num.toStr a))" + Range a b -> "(u32 >= $(Num.toStr a) && u32 <= $(Num.toStr b))" diff --git a/package/InternalEmojiGen.roc b/package/InternalEmojiGen.roc index 764ef3c..84ca827 100644 --- a/package/InternalEmojiGen.roc +++ b/package/InternalEmojiGen.roc @@ -1,6 +1,6 @@ ## The purpose of this file is to generate the InternalEmoji.roc file. -## -## This file will read the test data from `data/emoji-data.txt` +## +## This file will read the test data from `data/emoji-data.txt` ## parse it and then generate the implementation for each of the Emoji properties. app "gen" packages { @@ -14,7 +14,7 @@ app "gen" pf.Arg, pf.File, "data/emoji-data.txt" as file : Str, - Helpers.{CPMeta, PropertyMap}, + Helpers.{ CPMeta, PropertyMap }, ] provides [main] to pf @@ -22,7 +22,7 @@ main : Task {} I32 main = getFilePath |> Task.await writeToFile - |> Task.onErr \err -> Stderr.line "\(err)" + |> Task.onErr \err -> Stderr.line "$(err)" EMOJIProp : [Emoji, Presentation, Modifier, Base, Component, Pictographic] EMOJIMeta : { fromBytes : List U8, property : EMOJIProp, toStr : Str } @@ -47,14 +47,14 @@ getFilePath = args <- Arg.list |> Task.await when args |> List.get 1 is - Ok arg -> Task.ok (Path.fromStr "\(Helpers.removeTrailingSlash arg)/InternalEmoji.roc") + Ok arg -> Task.ok (Path.fromStr "$(Helpers.removeTrailingSlash arg)/InternalEmoji.roc") Err _ -> Task.err "USAGE: roc run InternalEmoji.roc -- path/to/package/" writeToFile : Path -> Task {} Str writeToFile = \path -> File.writeUtf8 path template - |> Task.mapErr \_ -> "ERROR: unable to write to \(Path.display path)" - |> Task.await \_ -> Stdout.line "\nSucessfully wrote to \(Path.display path)\n" + |> Task.mapErr 
\_ -> "ERROR: unable to write to $(Path.display path)" + |> Task.await \_ -> Stdout.line "\nSuccessfully wrote to $(Path.display path)\n" template = """ @@ -63,10 +63,10 @@ template = exposes [EMOJI, fromCP, isPictographic] imports [InternalCP.{ CP, toU32 }] - \(propDefTemplate) - \(isFuncTemplate) + $(propDefTemplate) + $(isFuncTemplate) - \(fromCPTemplate) + $(fromCPTemplate) """ propDefTemplate : Str @@ -75,11 +75,11 @@ propDefTemplate = propStrs = listMeta |> List.map .toStr - |> List.map \str -> "\(str)" + |> List.map \str -> "$(str)" |> Str.joinWith ", " """ - EMOJI : [\(propStrs)] + EMOJI : [$(propStrs)] """ isFuncTemplate : Str @@ -94,13 +94,13 @@ isFuncTemplate = """ - \(name) : U32 -> Bool - \(name) = \\u32 -> \(exp) + $(name) : U32 -> Bool + $(name) = \\u32 -> $(exp) """ # For each EMOJIProp define a function that returns true if the given code point has that property listMeta - |> List.keepOks \{ property } -> + |> List.keepOks \{ property } -> when property is Emoji -> help "isEmoji" Emoji |> Ok Presentation -> help "isPresentation" Presentation |> Ok @@ -118,19 +118,19 @@ fromCPTemplate = u32 = toU32 cp - \(isXtemp listMeta "") + $(isXtemp listMeta "") """ # HELPERS # Parse the file to map between code points and properties fileMap : List (PropertyMap EMOJIProp) -fileMap = Helpers.properyMapFromFile file parsePropPart +fileMap = Helpers.propertyMapFromFile file parsePropPart # Make a helper that returns a list of code points for the given property cpsForProperty : EMOJIProp -> List CPMeta cpsForProperty = \current -> - Helpers.filterPropertyMap + Helpers.filterPropertyMap fileMap \{ cp, prop } -> if prop == current then Ok cp else Err NotNeeded @@ -165,36 +165,40 @@ expect emojiPropParser "Extended_Pictographic" == Ok Pictographic expect emojiPropParser "Emoji_Modifier_Base" == Ok Base expect emojiPropParser "# ===" == Err ParsingError - -# For each property, generate a function that returns true if the given code -# point has that property +# For 
each property, generate a function that returns true if the given code +# point has that property isXtemp : List EMOJIMeta, Str -> Str isXtemp = \props, buf -> when List.first props is Err ListWasEmpty -> - "\(buf)\n Err NonEmojiCodePoint\n" + "$(buf)\n Err NonEmojiCodePoint\n" Ok prop -> when prop.property is - Emoji -> + Emoji -> next = ifXStr "isEmoji" "Emoji" - isXtemp (List.dropFirst props 1) "\(buf)\(next)" - Presentation -> + isXtemp (List.dropFirst props 1) "$(buf)$(next)" + + Presentation -> next = ifXStr "isPresentation" "Presentation" - isXtemp (List.dropFirst props 1) "\(buf)\(next)" - Modifier -> + isXtemp (List.dropFirst props 1) "$(buf)$(next)" + + Modifier -> next = ifXStr "isModifier" "Modifier" - isXtemp (List.dropFirst props 1) "\(buf)\(next)" - Base -> + isXtemp (List.dropFirst props 1) "$(buf)$(next)" + + Base -> next = ifXStr "isBase" "Base" - isXtemp (List.dropFirst props 1) "\(buf)\(next)" - Component -> + isXtemp (List.dropFirst props 1) "$(buf)$(next)" + + Component -> next = ifXStr "isComponent" "Component" - isXtemp (List.dropFirst props 1) "\(buf)\(next)" - Pictographic -> + isXtemp (List.dropFirst props 1) "$(buf)$(next)" + + Pictographic -> next = ifXStr "isPictographic" "Pictographic" - isXtemp (List.dropFirst props 1) "\(buf)\(next)" + isXtemp (List.dropFirst props 1) "$(buf)$(next)" ifXStr : Str, Str -> Str ifXStr = \funcStr, str -> - "if \(funcStr) u32 then\n Ok \(str)\n else " + "if $(funcStr) u32 then\n Ok $(str)\n else " diff --git a/package/InternalGBPGen.roc b/package/InternalGBPGen.roc index 52d055e..e3a7494 100644 --- a/package/InternalGBPGen.roc +++ b/package/InternalGBPGen.roc @@ -1,6 +1,6 @@ ## The purpose of this file is to generate the InternalGBP.roc file. -## -## This file will read the test data from `data/GraphemeBreakProperty-15.1.0.txt` +## +## This file will read the test data from `data/GraphemeBreakProperty-15.1.0.txt` ## parse it and then generate the implementation for each of the GBP properties. 
app "gen" packages { @@ -14,7 +14,7 @@ app "gen" pf.Arg, pf.File, "data/GraphemeBreakProperty-15.1.0.txt" as file : Str, - Helpers.{CPMeta, PropertyMap}, + Helpers.{ CPMeta, PropertyMap }, ] provides [main] to pf @@ -22,7 +22,7 @@ main : Task {} I32 main = getFilePath |> Task.await writeToFile - |> Task.onErr \err -> Stderr.line "\(err)" + |> Task.onErr \err -> Stderr.line "$(err)" GBPProp : [CR, LF, Control, Extend, ZWJ, RI, Prepend, SpacingMark, L, V, T, LV, LVT, Other] GBPMeta : { fromBytes : List U8, property : GBPProp, toStr : Str } @@ -47,7 +47,7 @@ listMeta = { fromBytes: Str.toUtf8 "L", property: L, toStr: "L" }, { fromBytes: Str.toUtf8 "Other", property: Other, toStr: "Other" }, ] - + # TODO move these to a common helper file once module changes and builtin Task are available getFilePath : Task Path Str @@ -55,14 +55,14 @@ getFilePath = args <- Arg.list |> Task.await when args |> List.get 1 is - Ok arg -> Task.ok (Path.fromStr "\(Helpers.removeTrailingSlash arg)/InternalGBP.roc") + Ok arg -> Task.ok (Path.fromStr "$(Helpers.removeTrailingSlash arg)/InternalGBP.roc") Err _ -> Task.err "USAGE: roc run InternalGBP.roc -- path/to/package/" writeToFile : Path -> Task {} Str writeToFile = \path -> File.writeUtf8 path template - |> Task.mapErr \_ -> "ERROR: unable to write to \(Path.display path)" - |> Task.await \_ -> Stdout.line "\nSucessfully wrote to \(Path.display path)\n" + |> Task.mapErr \_ -> "ERROR: unable to write to $(Path.display path)" + |> Task.await \_ -> Stdout.line "\nSuccessfully wrote to $(Path.display path)\n" template = """ @@ -71,11 +71,11 @@ template = exposes [GBP, fromCP, isExtend, isZWJ] imports [InternalCP.{ CP, toU32, fromU32Unchecked }] - \(propDefTemplate) - \(isFuncTemplate) + $(propDefTemplate) + $(isFuncTemplate) - \(fromCPTemplate) - \(testsTemplate) + $(fromCPTemplate) + $(testsTemplate) """ propDefTemplate : Str @@ -84,11 +84,11 @@ propDefTemplate = propStrs = listMeta |> List.map .toStr - |> List.map \str -> "\(str)" + |> 
List.map \str -> "$(str)" |> Str.joinWith ", " """ - GBP : [\(propStrs)] + GBP : [$(propStrs)] """ isFuncTemplate : Str @@ -103,8 +103,8 @@ isFuncTemplate = """ - \(name) : U32 -> Bool - \(name) = \\u32 -> \(exp) + $(name) : U32 -> Bool + $(name) = \\u32 -> $(exp) """ # For each GBPProp define a function that returns true if the given code point has that property @@ -135,11 +135,11 @@ fromCPTemplate = u32 = toU32 cp - \(isXtemp listMeta "") + $(isXtemp listMeta "") """ testsTemplate : Str -testsTemplate = +testsTemplate = [ ("000D", "CR"), ("000A", "LF"), @@ -162,7 +162,7 @@ testsTemplate = ] |> List.map unicodeHexToTest |> Str.joinWith "\n\n" - + # HELPERS parsePropPart : Str -> Result GBPProp [ParsingError] @@ -177,96 +177,95 @@ expect parsePropPart " Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL L # Parse the file to map between code points and properties fileMap : List (PropertyMap GBPProp) -fileMap = Helpers.properyMapFromFile file parsePropPart +fileMap = Helpers.propertyMapFromFile file parsePropPart # Make a helper that returns a list of code points for the given property cpsForProperty : GBPProp -> List CPMeta cpsForProperty = \current -> - Helpers.filterPropertyMap + Helpers.filterPropertyMap fileMap \{ cp, prop } -> if prop == current then Ok cp else Err NotNeeded -# For each property, generate a function that returns true if the given code -# point has that property +# For each property, generate a function that returns true if the given code +# point has that property isXtemp : List GBPMeta, Str -> Str isXtemp = \props, buf -> when List.first props is Err ListWasEmpty -> - "\(buf)\n Other\n" + "$(buf)\n Other\n" Ok prop -> when prop.property is CR -> next = ifXStr "isCR" "CR" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") LF -> next = ifXStr "isLF" "LF" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") Control -> next = ifXStr 
"isControl" "Control" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") Extend -> next = ifXStr "isExtend" "Extend" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") ZWJ -> next = ifXStr "isZWJ" "ZWJ" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") RI -> next = ifXStr "isRI" "RI" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") Prepend -> next = ifXStr "isPrepend" "Prepend" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") SpacingMark -> next = ifXStr "isSpacingMark" "SpacingMark" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") L -> next = ifXStr "isL" "L" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") V -> next = ifXStr "isV" "V" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") T -> next = ifXStr "isT" "T" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") LV -> next = ifXStr "isLV" "LV" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") LVT -> next = ifXStr "isLVT" "LVT" - isXtemp (List.dropFirst props 1) ("\(buf)\(next)") + isXtemp (List.dropFirst props 1) ("$(buf)$(next)") Other -> isXtemp (List.dropFirst props 1) buf ifXStr : Str, Str -> Str ifXStr = \funcStr, str -> - "if \(funcStr) u32 then\n \(str)\n else " + "if $(funcStr) u32 then\n $(str)\n else " -# Helper to manually generate a test -unicodeHexToTest : (Str, Str) -> Str +# Helper to manually generate a test +unicodeHexToTest : (Str, Str) -> Str unicodeHexToTest = \(hex, gbpExpected) -> u32 = hex |> Str.toUtf8 |> Helpers.hexBytesToU32 """ - expect 
# test U+\(hex) gives \(gbpExpected) - gbp = fromCP (fromU32Unchecked \(Num.toStr u32)) - gbp == \(gbpExpected) + expect # test U+$(hex) gives $(gbpExpected) + gbp = fromCP (fromU32Unchecked $(Num.toStr u32)) + gbp == $(gbpExpected) """ - gbpPropParser : Str -> Result GBPProp [ParsingError] gbpPropParser = \input -> startsWithProp : GBPMeta -> Result GBPProp [NonPropSequence] @@ -292,4 +291,3 @@ expect gbpPropParser "LVT" == Ok LVT expect gbpPropParser "Other" == Ok Other expect gbpPropParser "# ===" == Err ParsingError - diff --git a/package/Scalar.roc b/package/Scalar.roc index 571590a..a4da234 100644 --- a/package/Scalar.roc +++ b/package/Scalar.roc @@ -3,7 +3,7 @@ interface Scalar Scalar, toU32, toCodePoint, - fromCodePt, + fromCodePoint, fromStr, toScalars, startsWithScalar, @@ -18,36 +18,34 @@ interface Scalar ] ## A [Unicode scalar value](http://www.unicode.org/glossary/#unicode_scalar_value) - that is, -## any [code point](./CodePoint#CodePoint) except for [high-surrogate](http://www.unicode.org/glossary/#high_surrogate_code_point) -## and [low-surrogate](http://www.unicode.org/glossary/#low_surrogate_code_point) code points. +## any [code point](./CodePoint#CodePoint) except for [high-surrogate](./CodePoint#isHighSurrogate) +## and [low-surrogate](./CodePoint#isLowSurrogate) code points. Scalar := CodePoint toU32 : Scalar -> U32 toU32 = \@Scalar cp -> CodePoint.toU32 cp -## Any Unicode code point except high-surrogate and low-surrogate code points. +## Any Unicode code point except high-surrogate and low-surrogate code points. ## ## Note UTF-8 does not use surrogates as it is a variable-width encoding unlike UTF-16. 
fromU32 : U32 -> Result Scalar [InvalidScalar] fromU32 = \u32 -> - - inRangeA = u32 >= 0x000000 && u32 <= 0xD7FF16 - inRangeB = u32 >= 0xE00016 && u32 <= 0x10FFFF16 - + inRangeA = u32 >= 0x0000 && u32 <= 0xD7FF + inRangeB = u32 >= 0xE000 && u32 <= 0x10FFFF if inRangeA || inRangeB then Ok (@Scalar (InternalCP.fromU32Unchecked u32)) else Err InvalidScalar toCodePoint : Scalar -> CodePoint -toCodePoint = \@Scalar cp -> cp +toCodePoint = \@Scalar codePoint -> codePoint ## Convert a code point to a scalar value. This can fail if the given ## code point is -fromCodePt : CodePoint -> Result Scalar [NonScalarCodePt] -fromCodePt = \cp -> - if isValidScalar cp then - Ok (@Scalar cp) +fromCodePoint : CodePoint -> Result Scalar [NonScalarCodePt] +fromCodePoint = \codePoint -> + if isValidScalar codePoint then + Ok (@Scalar codePoint) else Err NonScalarCodePt @@ -62,7 +60,7 @@ fromStr = \_str -> # Err InvalidScalar -> # u32str = Num.toStr u32 -# crash "appendToStr received a Scalar value of \(u32str). This is an invalid Unicode scalar value, so it should not have been possible to obtain a `Scalar` which wraps it!" +# crash "appendToStr received a Scalar value of $(u32str). This is an invalid Unicode scalar value, so it should not have been possible to obtain a `Scalar` which wraps it!" # TODO WHAT IS THIS? # walkStr : Str, state, (state, Scalar -> state) -> state