Skip to content

Commit

Permalink
Merge pull request #5 from ricardo-valero/quick-fix-cleanup
Browse files Browse the repository at this point in the history
Quick fix cleanup
  • Loading branch information
lukewilliamboswell authored Feb 27, 2024
2 parents 9b86820 + 65134fb commit 09f22e6
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 194 deletions.
4 changes: 2 additions & 2 deletions examples/simple.roc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ maybeLength = word |> Str.toUtf8 |> CodePoint.parseUtf8 |> Result.map List.len

main =
when maybeLength is
Ok count -> Stdout.line "\n\nThere are a total of \(Num.toStr count) code points in \(word)\n\n"
Err _ -> crash "ERROR: Unable to parse \(word)!"
Ok count -> Stdout.line "\n\nThere are a total of $(Num.toStr count) code points in $(word)\n\n"
Err _ -> crash "ERROR: Unable to parse $(word)!"

59 changes: 25 additions & 34 deletions package/CodePoint.roc
Original file line number Diff line number Diff line change
Expand Up @@ -29,40 +29,34 @@ toU32 = InternalCP.toU32
## (that is, it's between `0` and `0x10FFFF`).
fromU32 : U32 -> Result CodePoint [InvalidCodePoint]
fromU32 = \u32 ->
# Definition: http://www.unicode.org/glossary/#code_point
if u32 <= 0x10FFFF then
Ok (fromU32Unchecked u32)
else
Err InvalidCodePoint

## Returns false if this is either a [high-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point)
## or a [low-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point).
##
## To check for either of those individually, use [isHighSurrogate] or [isLowSurrogate]
## Returns false if this is [isHighSurrogate] or [isLowSurrogate]
isValidScalar : CodePoint -> Bool
isValidScalar = \codePoint -> !(isHighSurrogate codePoint || isLowSurrogate codePoint)

## Returns true if this is a [high-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point)
## (`0xD800` to `0xDBFF`)
## from U+D800 to U+DBFF
isHighSurrogate : CodePoint -> Bool
isHighSurrogate = \codePoint ->
u32 = InternalCP.toU32 codePoint

u32 >= 0xDC00 && u32 <= 0xDFFF
u32 = toU32 codePoint
u32 >= 0xD800 && u32 <= 0xDBFF

## Returns true if this is a [low-surrogate code point](https://www.unicode.org/glossary/#low_surrogate_code_point)
## U+DC00 to U+DFFF
## from U+DC00 to U+DFFF
isLowSurrogate : CodePoint -> Bool
isLowSurrogate = \codePoint ->
u32 = InternalCP.toU32 codePoint

u32 = toU32 codePoint
u32 >= 0xDC00 && u32 <= 0xDFFF

## Zig docs: bytes the UTF-8 representation would require
## for the given codepoint.
utf8Len : CodePoint -> Result U64 [InvalidCodePoint]
utf8Len : CodePoint -> Result U8 [InvalidCodePoint]
utf8Len = \codePoint ->
u32 = InternalCP.toU32 codePoint
u32 = toU32 codePoint

if u32 < 0x80 then
Ok 1
Expand All @@ -78,8 +72,7 @@ utf8Len = \codePoint ->
## Encode a Scalar as UTF-8 bytes and append those bytes to an existing list of UTF-8 bytes.
appendUtf8 : List U8, CodePoint -> List U8
appendUtf8 = \bytes, codePoint ->
u32 = InternalCP.toU32 codePoint

u32 = toU32 codePoint
if u32 < 0x80 then
List.append bytes (Num.toU8 u32)
else if u32 < 0x800 then
Expand Down Expand Up @@ -173,9 +166,9 @@ addContinuation = \original, continuationByte ->
|> Num.bitwiseOr (Num.toU32 (Num.bitwiseAnd continuationByte 0b00111111))

## The number of UTF-8 bytes it takes to represent this Scalar.
countUtf8Bytes : CodePoint -> U64
countUtf8Bytes : CodePoint -> U8
countUtf8Bytes = \codePoint ->
u32 = InternalCP.toU32 codePoint
u32 = toU32 codePoint

if u32 < 0x80 then
1
Expand Down Expand Up @@ -252,11 +245,9 @@ Utf8ParseErr : [OverlongEncoding, ExpectedContinuation, EncodesSurrogateHalf, In

parseUtf8 : List U8 -> Result (List CodePoint) Utf8ParseErr
parseUtf8 = \bytes ->

# we will have at most List.len bytes code points
listWithCapacity : List CodePoint
listWithCapacity = List.withCapacity (List.len bytes)

parseUtf8Help bytes listWithCapacity

parseUtf8Help : List U8, List CodePoint -> Result (List CodePoint) Utf8ParseErr
Expand Down Expand Up @@ -339,34 +330,34 @@ parsePartialUtf8 = \bytes ->
else
Err InvalidUtf8


toStr : List CP -> Result Str [BadUtf8]
toStr : List CodePoint -> Result Str [BadUtf8]
toStr = \cps ->

# allocated extra space for the extra bytes as some CPs expand into
# allocated extra space for the extra bytes as some CPs expand into
# multiple U8s, so this minimises extra allocations
capacity = List.withCapacity (50 + List.len cps)

cps
|> cpsToStrHelp capacity
cps
|> cpsToStrHelp capacity
|> Str.fromUtf8
|> Result.onErr \_ -> Err BadUtf8

cpsToStrHelp : List CP, List U8 -> List U8
cpsToStrHelp : List CodePoint, List U8 -> List U8
cpsToStrHelp = \cps, bytes ->
when cps is
when cps is
[] -> bytes
[cp,..] ->
cpsToStrHelp
[cp, ..] ->
cpsToStrHelp
(List.dropFirst cps 1)
(CodePoint.appendUtf8 bytes cp)

expect # test toStr
cr = (fromU32Unchecked 13)
lf = (fromU32Unchecked 10)

expect
# test toStr
cr = fromU32Unchecked 13
lf = fromU32Unchecked 10

toStr [cr, lf] == Ok "\r\n"

## Empty input
expect [] |> parsePartialUtf8 == Err ListWasEmpty

Expand Down
71 changes: 29 additions & 42 deletions package/Grapheme.roc
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ Grapheme : InternalGBP.GBP
# Note GB13 is not used, it has been merged with GB12 as they are identical as far as I can tell
Rule : [GB1, GB2, GB3, GB4, GB5, GB6, GB7, GB8, GB9, GB9a, GB9b, GB9c, GB11, GB12, GB999]

# User internally to represent the text segmentation algorithm. We include the
# User internally to represent the text segmentation algorithm. We include the
# Rules here so that it is feasible to debug this and ensure algorithm correctness
# We could remove these and reduce the number of allocations, however it is very
# We could remove these and reduce the number of allocations, however it is very
# difficult then to understand if the implementation is applying each rule correctly
Tokens : List [BR Rule,NB Rule,CP CodePoint]
Tokens : List [BR Rule, NB Rule, CP CodePoint]

## Split a string into extended grapheme clusters
split : Str -> Result (List Str) Utf8ParseErr
Expand Down Expand Up @@ -63,19 +63,17 @@ splitHelp = \state, codePoints, breakPoints, acc ->
nextBPs = List.dropFirst breakPoints 1

when (state, codePoints, breakPoints) is

# Special handling for last codepoint
(Next, [cp], _) -> List.concat acc [CP cp, BR GB2]
(AfterHungulL prev, [cp], [bp]) if bp == L || bp == V || bp == LV || bp == LVT -> List.concat acc [CP prev, NB GB6, CP cp, BR GB2]
(AfterHungulLVorV prev, [cp], [bp]) if bp == V || bp == T -> List.concat acc [CP prev, NB GB7, CP cp, BR GB2]
(AfterHungulLVTorT prev, [cp], [bp]) if bp == T -> List.concat acc [CP prev, NB GB8, CP cp, BR GB2]
(AfterHungulL prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHungulLVorV prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHungulLVTorT prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHangulL prev, [cp], [bp]) if bp == L || bp == V || bp == LV || bp == LVT -> List.concat acc [CP prev, NB GB6, CP cp, BR GB2]
(AfterHangulLVorV prev, [cp], [bp]) if bp == V || bp == T -> List.concat acc [CP prev, NB GB7, CP cp, BR GB2]
(AfterHangulLVTorT prev, [cp], [bp]) if bp == T -> List.concat acc [CP prev, NB GB8, CP cp, BR GB2]
(AfterHangulL prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHangulLVorV prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHangulLVTorT prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(LastWithPrev prev, [cp], [bp]) if bp == Control || bp == CR || bp == LF -> List.concat acc [CP prev, BR GB5, CP cp, BR GB2]
(LastWithPrev prev, [cp], [bp]) if bp == Extend -> List.concat acc [CP prev, NB GB9, CP cp, BR GB2]
(LastWithPrev prev, [cp], [bp]) if bp == ZWJ ->

if prev |> CodePoint.toU32 |> InternalEmoji.isPictographic then
List.concat acc [CP prev, NB GB11, CP cp, BR GB2]
else
Expand All @@ -87,26 +85,21 @@ splitHelp = \state, codePoints, breakPoints, acc ->
(EmojiSeqNext prev, [], []) -> List.concat acc [CP prev, BR GB2]
(EmojiSeqNext prev, [cp], [_]) -> List.concat acc [CP prev, NB GB11, CP cp, BR GB2]
(EmojiSeqZWJ prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc

(AfterEvenRI prev, [], []) -> List.concat acc [CP prev, BR GB2]
(AfterOddRI prev, [], []) -> List.concat acc [CP prev, BR GB2]

# Looking at current breakpoint property
(Next, [cp, ..], [bp, ..]) if bp == CR -> splitHelp (AfterCR cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == Control || bp == LF -> splitHelp Next nextCPs nextBPs (List.concat acc [CP cp, BR GB4])
(Next, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHungulL cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == LVT || bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHangulL cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == LVT || bp == T -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterOddRI cp) nextCPs nextBPs acc

# Advance to next, this is requred so that we can apply rules which break before
# Advance to next, this is required so that we can apply rules which break before
(Next, [cp, ..], _) -> splitHelp (LookAtNext cp) nextCPs nextBPs acc

# Looking ahead at next, given previous
(LookAtNext prev, _, [bp, ..]) if bp == Control || bp == CR || bp == LF -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB5])
(LookAtNext prev, [cp, ..], [bp, ..]) if bp == Extend -> splitHelp (AfterExtend cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9])
(LookAtNext prev, [cp, ..], [bp, ..]) if bp == ZWJ ->

if prev |> CodePoint.toU32 |> InternalEmoji.isPictographic then
# enter emoji sequence
splitHelp (EmojiSeqNext cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9])
Expand All @@ -115,15 +108,13 @@ splitHelp = \state, codePoints, breakPoints, acc ->

# Look ahead, given previous was Emoji related
(EmojiSeqZWJ prev, [cp, ..], [bp, ..]) ->

if bp == ZWJ then
# got another ZWJ continue the sequence
splitHelp (EmojiSeqNext cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB11])
else
splitHelp Next codePoints breakPoints acc

(EmojiSeqNext prev, [cp, ..], [_, ..]) ->

if cp |> CodePoint.toU32 |> InternalEmoji.isPictographic then
# got another emoji, continue the sequence
splitHelp (EmojiSeqZWJ cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB11])
Expand All @@ -134,31 +125,27 @@ splitHelp = \state, codePoints, breakPoints, acc ->
(AfterExtend prev, [cp, ..], [bp, ..]) if bp == Extend -> splitHelp (AfterExtend cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9])
(AfterExtend prev, [_, ..], [_, ..]) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB999])
(LookAtNext prev, _, _) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB999])

# Looking ahead, given previous was a Regional Indicator
(AfterOddRI prev, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterEvenRI cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB12])
(AfterOddRI prev, [_, ..], [_, ..]) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterEvenRI prev, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterOddRI cp) nextCPs nextBPs (List.concat acc [CP prev, BR GB999])
(AfterEvenRI prev, [_, ..], [_, ..]) -> splitHelp (LookAtNext prev) codePoints breakPoints acc

# Looking ahead, given previous was CR
(AfterCR prev, _, [bp, ..]) if bp == LF -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, NB GB3])
(AfterCR prev, _, _) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB4])

# Looking ahead, given previous was Hangul
(AfterHungulL prev, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHungulL cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHungulL prev, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHungulL prev, [cp, ..], [bp, ..]) if bp == LVT -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHungulL prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHungulL prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterHungulLVorV prev, [cp, ..], [bp, ..]) if bp == V -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7])
(AfterHungulLVorV prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7])
(AfterHungulLVorV prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHungulLVorV prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterHungulLVTorT prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB8])
(AfterHungulLVTorT prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHungulLVTorT prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc

(AfterHangulL prev, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHangulL cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHangulL prev, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHangulL prev, [cp, ..], [bp, ..]) if bp == LVT -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHangulL prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHangulL prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterHangulLVorV prev, [cp, ..], [bp, ..]) if bp == V -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7])
(AfterHangulLVorV prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7])
(AfterHangulLVorV prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHangulLVorV prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterHangulLVTorT prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB8])
(AfterHangulLVTorT prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHangulLVTorT prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
# Print out a helpful error message requesting users report the unhandled case.
_ ->
crash
Expand All @@ -168,10 +155,10 @@ splitHelp = \state, codePoints, breakPoints, acc ->
It is difficult to track down and catch every possible combination, so it would be helpful if you could log this as an issue with a reproduction.
Grapheme.split state machine state at the time was:
\(Inspect.toStr (state, List.map codePoints CodePoint.toU32, breakPoints))
$(Inspect.toStr (state, List.map codePoints CodePoint.toU32, breakPoints))
"""

# Used internally as a test helper to generate the expected answer for a given
# Used internally as a test helper to generate the expected answer for a given
# input. Most of the test inputs come from the test data, some are manually developed
# to cover additional edge cases not found in the test data file.
testHelp : List (List U32) -> Tokens
Expand Down Expand Up @@ -342,7 +329,7 @@ expect
]
a == b

# GB11 emoji another complicated example
# GB11 emoji another complicated example
# % [0.2] LATIN SMALL LETTER A (Other) x [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) % [999.0] BABY (ExtPict) x [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) x [11.0] OCTAGONAL SIGN (ExtPict) % [0.3]
expect
a = testHelp [[97, 127999], [128118, 8205, 128721]]
Expand Down Expand Up @@ -383,4 +370,4 @@ expect
a == b

expect split "πŸ₯·πŸΌ" == Ok ["πŸ₯·πŸΌ"]
expect split "πŸ‡¦πŸ‡ΊπŸ¦˜πŸͺƒ" == Ok ["πŸ‡¦πŸ‡Ί", "🦘", "πŸͺƒ"]
expect split "πŸ‡¦πŸ‡ΊπŸ¦˜πŸͺƒ" == Ok ["πŸ‡¦πŸ‡Ί", "🦘", "πŸͺƒ"]
Loading

0 comments on commit 09f22e6

Please sign in to comment.