Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a UTF8 module as an alternative to String #2745

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions core/src/Streamly/Data/Fold.hs
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ module Streamly.Data.Fold
, foldl1'
, foldlM1'
, foldr'
, foldrM'

-- * Folds
-- ** Accumulators
Expand Down
10 changes: 10 additions & 0 deletions core/src/Streamly/Internal/Data/Stream/Nesting.hs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ module Streamly.Internal.Data.Stream.Nesting
-- | A special case of parsing.
, wordsBy
, splitOnSeq -- XXX splitOnSeg
, splitOnSeqWith
, splitOnSuffixSeq -- XXX splitOnSegSuffix, splitOnTrailer

-- XXX Implement these as folds or parsers instead.
Expand Down Expand Up @@ -2108,6 +2109,15 @@ data SplitOnSeqState rb rh ck w fs s b x =

| SplitOnSeqReinit (fs -> SplitOnSeqState rb rh ck w fs s b x)

{-# INLINE_NORMAL splitOnSeqWith #-}
splitOnSeqWith
:: Array c
-> (a -> c)
-> Fold m a b
-> Stream m a
-> Stream m b
splitOnSeqWith = undefined

{-# INLINE_NORMAL splitOnSeq #-}
splitOnSeq
:: forall m a b. (MonadIO m, Storable a, Unbox a, Enum a, Eq a)
Expand Down
96 changes: 96 additions & 0 deletions core/src/Streamly/Internal/Unicode/Stream.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ module Streamly.Internal.Unicode.Stream
, decodeUtf8
, decodeUtf8'
, decodeUtf8_
, decodeUtf8Indexed
, decodeUtf8ReverseIndexed

-- ** UTF-16 Decoding
, decodeUtf16le'
Expand All @@ -39,6 +41,7 @@ module Streamly.Internal.Unicode.Stream
, CodePoint
, decodeUtf8Either
, resumeDecodeUtf8Either
, resumeDecodeUtf8EitherI

-- ** UTF-8 Array Stream Decoding
, decodeUtf8Chunks
Expand Down Expand Up @@ -348,6 +351,89 @@ data UTF8DecodeState s a
| YieldAndContinue a (UTF8DecodeState s a)
| Done

#ifndef __GHCJS__
{-# ANN type UTF8DecodeStateI Fuse #-}
#endif
data UTF8DecodeStateI i s a
= UTF8DecodeInitI i s
| UTF8DecodeInit1I i s !Word8
| UTF8DecodeFirstI i s !Word8
| UTF8DecodingI i i s !DecodeState !CodePoint
| YieldAndContinueI i i a (UTF8DecodeStateI i s a)
| DoneI

{-# INLINE_NORMAL resumeDecodeUtf8EitherI #-}
resumeDecodeUtf8EitherI
:: Monad m
=> DecodeState
-> CodePoint
-> Int
-> D.Stream m Word8
-> D.Stream m ((Int, Int), (Either DecodeError Char))
resumeDecodeUtf8EitherI dst codep ix0 (D.Stream step state) =
let stt =
if dst == 0
then UTF8DecodeInitI ix0 state
else UTF8DecodingI ix0 0 state dst codep
in D.Stream (step' utf8d) stt
where
{-# INLINE_LATE step' #-}
step' _ gst (UTF8DecodeInitI ix st) = do
r <- step (adaptState gst) st
return $ case r of
Yield x s -> Skip (UTF8DecodeInit1I ix s x)
Skip s -> Skip (UTF8DecodeInitI ix s)
Stop -> Skip DoneI

step' _ _ (UTF8DecodeInit1I ix st x) = do
-- Note: It is important to use a ">" instead of a "<=" test
-- here for GHC to generate code layout for default branch
-- prediction for the common case. This is fragile and might
-- change with the compiler versions, we need a more reliable
-- "likely" primitive to control branch predication.
case x > 0x7f of
False ->
return $ Skip $ YieldAndContinueI ix 1
(Right $ unsafeChr (fromIntegral x))
(UTF8DecodeInitI (ix + 1) st)
-- Using a separate state here generates a jump to a
-- separate code block in the core which seems to perform
-- slightly better for the non-ascii case.
True -> return $ Skip $ UTF8DecodeFirstI ix st x

-- XXX should we merge it with UTF8DecodeInit1?
step' table _ (UTF8DecodeFirstI ix st x) = do
let (Tuple' sv cp) = decode0 table x
return $
case sv of
12 ->
Skip $ YieldAndContinueI ix 1 (Left $ DecodeError 0 (fromIntegral x))
(UTF8DecodeInitI (ix + 1) st)
0 -> error "unreachable state"
_ -> Skip (UTF8DecodingI ix 1 st sv cp)

-- We recover by trying the new byte x a starter of a new codepoint.
-- XXX on error need to report the next byte "x" as well.
-- XXX need to use the same recovery in array decoding routine as well
step' table gst (UTF8DecodingI ix len st statePtr codepointPtr) = do
r <- step (adaptState gst) st
case r of
Yield x s -> do
let (Tuple' sv cp) = decode1 table statePtr codepointPtr x
return $
case sv of
0 -> Skip $ YieldAndContinueI ix (len + 1) (Right $ unsafeChr cp)
(UTF8DecodeInitI (ix + len + 1) s)
12 ->
Skip $ YieldAndContinueI ix (len + 1) (Left $ DecodeError statePtr codepointPtr)
(UTF8DecodeInit1I (ix + len + 1) s x)
_ -> Skip (UTF8DecodingI ix (len + 1) s sv cp)
Skip s -> return $ Skip (UTF8DecodingI ix len s statePtr codepointPtr)
Stop -> return $ Skip $ YieldAndContinueI ix len (Left $ DecodeError statePtr codepointPtr) DoneI

step' _ _ (YieldAndContinueI ix len c s) = return $ Yield ((ix, len), c) s
step' _ _ DoneI = return Stop

{-# INLINE_NORMAL resumeDecodeUtf8EitherD #-}
resumeDecodeUtf8EitherD
:: Monad m
Expand Down Expand Up @@ -558,6 +644,16 @@ parseCharUtf8With ::
Monad m => CodingFailureMode -> Parser.Parser Word8 m Char
parseCharUtf8With = parseCharUtf8WithD

{-# INLINE_NORMAL decodeUtf8ReverseIndexed #-}
decodeUtf8ReverseIndexed ::
CodingFailureMode -> D.Stream m Word8 -> D.Stream m ((Int, Int), Char)
decodeUtf8ReverseIndexed = undefined

{-# INLINE_NORMAL decodeUtf8Indexed #-}
decodeUtf8Indexed ::
CodingFailureMode -> D.Stream m Word8 -> D.Stream m ((Int, Int), Char)
decodeUtf8Indexed = undefined

-- XXX write it as a parser and use parseMany to decode a stream, need to check
-- if that preserves the same performance. Or we can use a resumable parser
-- that parses a chunk at a time.
Expand Down
Loading
Loading