From a9193cb80d8e9fbd40bb8707916ae6985195876d Mon Sep 17 00:00:00 2001 From: Adithya Kumar Date: Mon, 13 May 2024 10:25:24 +0530 Subject: [PATCH 1/2] Expose Fold.foldrM' --- core/src/Streamly/Data/Fold.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/Streamly/Data/Fold.hs b/core/src/Streamly/Data/Fold.hs index b83c949555..e29796f253 100644 --- a/core/src/Streamly/Data/Fold.hs +++ b/core/src/Streamly/Data/Fold.hs @@ -206,6 +206,7 @@ module Streamly.Data.Fold , foldl1' , foldlM1' , foldr' + , foldrM' -- * Folds -- ** Accumulators From 33c1c6190e3d1151a210c73b884d795383250adc Mon Sep 17 00:00:00 2001 From: Adithya Kumar Date: Mon, 13 May 2024 10:28:22 +0530 Subject: [PATCH 2/2] Add a UTF8 module with the goal of replacing String --- .../Streamly/Internal/Data/Stream/Nesting.hs | 10 + core/src/Streamly/Internal/Unicode/Stream.hs | 96 ++ core/src/Streamly/Internal/Unicode/UTF8.hs | 877 ++++++++++++++++++ core/streamly-core.cabal | 2 + default.nix | 12 +- src/Streamly/Internal/Unicode/Utf8.hs | 73 -- 6 files changed, 991 insertions(+), 79 deletions(-) create mode 100644 core/src/Streamly/Internal/Unicode/UTF8.hs delete mode 100644 src/Streamly/Internal/Unicode/Utf8.hs diff --git a/core/src/Streamly/Internal/Data/Stream/Nesting.hs b/core/src/Streamly/Internal/Data/Stream/Nesting.hs index 738eec5fc8..450c2b88cb 100644 --- a/core/src/Streamly/Internal/Data/Stream/Nesting.hs +++ b/core/src/Streamly/Internal/Data/Stream/Nesting.hs @@ -132,6 +132,7 @@ module Streamly.Internal.Data.Stream.Nesting -- | A special case of parsing. , wordsBy , splitOnSeq -- XXX splitOnSeg + , splitOnSeqWith , splitOnSuffixSeq -- XXX splitOnSegSuffix, splitOnTrailer -- XXX Implement these as folds or parsers instead. @@ -2108,6 +2109,15 @@ data SplitOnSeqState rb rh ck w fs s b x = | SplitOnSeqReinit (fs -> SplitOnSeqState rb rh ck w fs s b x) +{-# INLINE_NORMAL splitOnSeqWith #-} +splitOnSeqWith + :: Array c + -> (a -> c) + -> Fold m a b + -> Stream m a + -> Stream m b +splitOnSeqWith = undefined + {-# INLINE_NORMAL splitOnSeq #-} splitOnSeq :: forall m a b. (MonadIO m, Storable a, Unbox a, Enum a, Eq a) diff --git a/core/src/Streamly/Internal/Unicode/Stream.hs b/core/src/Streamly/Internal/Unicode/Stream.hs index bd364f45a0..1b84c99c13 100644 --- a/core/src/Streamly/Internal/Unicode/Stream.hs +++ b/core/src/Streamly/Internal/Unicode/Stream.hs @@ -28,6 +28,8 @@ module Streamly.Internal.Unicode.Stream , decodeUtf8 , decodeUtf8' , decodeUtf8_ + , decodeUtf8Indexed + , decodeUtf8ReverseIndexed -- ** UTF-16 Decoding , decodeUtf16le' @@ -39,6 +41,7 @@ module Streamly.Internal.Unicode.Stream , CodePoint , decodeUtf8Either , resumeDecodeUtf8Either + , resumeDecodeUtf8EitherI -- ** UTF-8 Array Stream Decoding , decodeUtf8Chunks @@ -348,6 +351,89 @@ data UTF8DecodeState s a | YieldAndContinue a (UTF8DecodeState s a) | Done +#ifndef __GHCJS__ +{-# ANN type UTF8DecodeStateI Fuse #-} +#endif +data UTF8DecodeStateI i s a + = UTF8DecodeInitI i s + | UTF8DecodeInit1I i s !Word8 + | UTF8DecodeFirstI i s !Word8 + | UTF8DecodingI i i s !DecodeState !CodePoint + | YieldAndContinueI i i a (UTF8DecodeStateI i s a) + | DoneI + +{-# INLINE_NORMAL resumeDecodeUtf8EitherI #-} +resumeDecodeUtf8EitherI + :: Monad m + => DecodeState + -> CodePoint + -> Int + -> D.Stream m Word8 + -> D.Stream m ((Int, Int), (Either DecodeError Char)) +resumeDecodeUtf8EitherI dst codep ix0 (D.Stream step state) = + let stt = + if dst == 0 + then UTF8DecodeInitI ix0 state + else UTF8DecodingI ix0 0 state dst codep + in D.Stream (step' utf8d) stt + where + {-# INLINE_LATE step' #-} + step' _ gst (UTF8DecodeInitI ix st) = do + r <- step (adaptState gst) st + return $ case r of + Yield x s -> Skip (UTF8DecodeInit1I ix s x) + Skip s -> Skip (UTF8DecodeInitI ix s) + Stop -> Skip DoneI + + step' _ _ (UTF8DecodeInit1I ix st x) = do + -- Note: It is important to use a ">" instead of a "<=" test + -- here for GHC to generate code layout for default branch + -- prediction for the common case. This is fragile and might + -- change with the compiler versions, we need a more reliable + -- "likely" primitive to control branch predication. + case x > 0x7f of + False -> + return $ Skip $ YieldAndContinueI ix 1 + (Right $ unsafeChr (fromIntegral x)) + (UTF8DecodeInitI (ix + 1) st) + -- Using a separate state here generates a jump to a + -- separate code block in the core which seems to perform + -- slightly better for the non-ascii case. + True -> return $ Skip $ UTF8DecodeFirstI ix st x + + -- XXX should we merge it with UTF8DecodeInit1? + step' table _ (UTF8DecodeFirstI ix st x) = do + let (Tuple' sv cp) = decode0 table x + return $ + case sv of + 12 -> + Skip $ YieldAndContinueI ix 1 (Left $ DecodeError 0 (fromIntegral x)) + (UTF8DecodeInitI (ix + 1) st) + 0 -> error "unreachable state" + _ -> Skip (UTF8DecodingI ix 1 st sv cp) + + -- We recover by trying the new byte x a starter of a new codepoint. + -- XXX on error need to report the next byte "x" as well. + -- XXX need to use the same recovery in array decoding routine as well + step' table gst (UTF8DecodingI ix len st statePtr codepointPtr) = do + r <- step (adaptState gst) st + case r of + Yield x s -> do + let (Tuple' sv cp) = decode1 table statePtr codepointPtr x + return $ + case sv of + 0 -> Skip $ YieldAndContinueI ix (len + 1) (Right $ unsafeChr cp) + (UTF8DecodeInitI (ix + len + 1) s) + 12 -> + Skip $ YieldAndContinueI ix (len + 1) (Left $ DecodeError statePtr codepointPtr) + (UTF8DecodeInit1I (ix + len + 1) s x) + _ -> Skip (UTF8DecodingI ix (len + 1) s sv cp) + Skip s -> return $ Skip (UTF8DecodingI ix len s statePtr codepointPtr) + Stop -> return $ Skip $ YieldAndContinueI ix len (Left $ DecodeError statePtr codepointPtr) DoneI + + step' _ _ (YieldAndContinueI ix len c s) = return $ Yield ((ix, len), c) s + step' _ _ DoneI = return Stop + {-# INLINE_NORMAL resumeDecodeUtf8EitherD #-} resumeDecodeUtf8EitherD :: Monad m @@ -558,6 +644,16 @@ parseCharUtf8With :: Monad m => CodingFailureMode -> Parser.Parser Word8 m Char parseCharUtf8With = parseCharUtf8WithD +{-# INLINE_NORMAL decodeUtf8ReverseIndexed #-} +decodeUtf8ReverseIndexed :: + CodingFailureMode -> D.Stream m Word8 -> D.Stream m ((Int, Int), Char) +decodeUtf8ReverseIndexed = undefined + +{-# INLINE_NORMAL decodeUtf8Indexed #-} +decodeUtf8Indexed :: + CodingFailureMode -> D.Stream m Word8 -> D.Stream m ((Int, Int), Char) +decodeUtf8Indexed = undefined + -- XXX write it as a parser and use parseMany to decode a stream, need to check -- if that preserves the same performance. Or we can use a resumable parser -- that parses a chunk at a time. diff --git a/core/src/Streamly/Internal/Unicode/UTF8.hs b/core/src/Streamly/Internal/Unicode/UTF8.hs new file mode 100644 index 0000000000..9a0de2272a --- /dev/null +++ b/core/src/Streamly/Internal/Unicode/UTF8.hs @@ -0,0 +1,877 @@ +{-# LANGUAGE CPP #-} + +module Streamly.Internal.Unicode.UTF8 + (module Streamly.Internal.Unicode.UTF8) where + +-------------------------------------------------------------------------------- +-- Imports +-------------------------------------------------------------------------------- + +import Control.Monad.IO.Class (MonadIO(..)) +import Data.Function ((&)) +import Data.Word (Word8) +import Streamly.Data.Array (Array) +import Streamly.Data.Fold (Fold) +import Streamly.Data.Stream (Stream) +import System.IO.Unsafe (unsafePerformIO) +import Data.Functor.Identity (Identity(..)) +import GHC.Exts (Addr#) + +import qualified Data.List as L +import qualified Streamly.Data.Array as Array +import qualified Streamly.Data.Fold as Fold +import qualified Streamly.Data.MutArray as MArray +import qualified Streamly.Data.Parser as Parser +import qualified Streamly.Data.Stream as Stream +import qualified Streamly.Data.StreamK as StreamK +import qualified Streamly.Data.Unfold as Unfold +import qualified Streamly.Unicode.Stream as Unicode +import qualified Unicode.Char as Char + +import qualified Streamly.Internal.Data.StreamK as StreamK (concatMap) +import qualified Streamly.Internal.Data.Array as Array + (unsafeFreeze, unsafeThaw, empty, getSliceUnsafe, fromChunks, fromByteStr#) +import qualified Streamly.Internal.Data.MutArray as MArray + (spliceExp, blockSize, spliceUnsafe) +import qualified Streamly.Internal.Unicode.Stream as Unicode + ( readCharUtf8', decodeUtf8Indexed, decodeUtf8ReverseIndexed + , CodingFailureMode(..), fromStr# + ) +import qualified Unicode.Internal.Unfold as UUnfold (toList) +import qualified Streamly.Internal.Data.Stream as Stream (splitOnSeqWith) + +import Prelude as P hiding (read, length, all, dropWhile) + +-------------------------------------------------------------------------------- +-- CPP +-------------------------------------------------------------------------------- + +#define INLINEABLE(x) {-# INLINEABLE x #-};x +#define UNIMPLEMENTED(x) x = error "The functions x is unimplemented" + +-------------------------------------------------------------------------------- +-- Type +-------------------------------------------------------------------------------- + +-- | A space efficient, packed, unboxed Unicode container. +newtype UTF8 = + UTF8 { unUTF8 :: Array Word8 } + +-------------------------------------------------------------------------------- +-- Functions +-------------------------------------------------------------------------------- + +{-# INLINE read #-} +read :: Monad m => UTF8 -> Stream m Char +read (UTF8 arr) = Unicode.decodeUtf8' $ Array.read arr + +{-# INLINE ordM #-} +ordM :: MonadIO m => Char -> m (Array Word8) +ordM = + Stream.fold (Array.createOf 4) + . Stream.unfold Unicode.readCharUtf8' + +INLINEABLE(appendM) :: MonadIO m => UTF8 -> UTF8 -> m UTF8 +appendM (UTF8 a) (UTF8 b) = + fmap (UTF8 . Array.unsafeFreeze) + $ MArray.spliceExp (Array.unsafeThaw a) (Array.unsafeThaw b) + +INLINEABLE(append) :: UTF8 -> UTF8 -> UTF8 +append a b = unsafePerformIO $ appendM a b + +{-# INLINE createOf #-} +createOf :: MonadIO m => Int -> Fold m Char UTF8 +createOf i = + Fold.foldlM' MArray.spliceExp (MArray.emptyOf i) + & Fold.lmapM (fmap Array.unsafeThaw . ordM) + & fmap (UTF8 . Array.unsafeFreeze) + +{-# INLINE create #-} +create :: MonadIO m => Fold m Char UTF8 +create = createOf MArray.blockSize + +INLINEABLE(fromStream) :: MonadIO m => Stream m Char -> m UTF8 +fromStream strm = + Unicode.encodeUtf8' strm + & Stream.fold Array.create + & fmap UTF8 + +INLINEABLE(fromStreamOf) :: MonadIO m => Int -> Stream m Char -> m UTF8 +fromStreamOf n strm = + Unicode.encodeUtf8' strm + & Stream.fold (Array.createOf n) + & fmap UTF8 + +-------------------------------------------------------------------------------- +-- Creation and elimination +-------------------------------------------------------------------------------- + +INLINEABLE(pack) :: String -> UTF8 +pack s = unsafePerformIO $ fromStreamOf (L.length s) $ Stream.fromList s + +INLINEABLE(unpack) :: UTF8 -> String +unpack text = unsafePerformIO $ Stream.fold Fold.toList $ read text + +INLINEABLE(singleton) :: Char -> UTF8 +singleton c = UTF8 $ unsafePerformIO $ ordM c + +INLINEABLE(empty) :: UTF8 +empty = UTF8 Array.empty + +-------------------------------------------------------------------------------- +-- Basic interface +-------------------------------------------------------------------------------- + +INLINEABLE(cons) :: Char -> UTF8 -> UTF8 +cons c s = append (singleton c) s + +INLINEABLE(snoc) :: UTF8 -> Char -> UTF8 +snoc s c = append s (singleton c) + +INLINEABLE(uncons) :: UTF8 -> Maybe (Char, UTF8) +uncons (UTF8 arr) = unsafePerformIO $ do + res <- + Array.read arr + & Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure + & Stream.fold Fold.one + case res of + Just ((_, i), c) -> + let txt = UTF8 $ Array.getSliceUnsafe i (Array.length arr - i) arr + in pure $ Just (c, txt) + Nothing -> pure Nothing + +INLINEABLE(unsnoc) :: UTF8 -> Maybe (Char, UTF8) +unsnoc (UTF8 arr) = unsafePerformIO $ do + res <- + Array.readRev arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.fold Fold.one + case res of + Just ((_, i), c) -> + let txt = UTF8 $ Array.getSliceUnsafe 0 (Array.length arr - i) arr + in pure $ Just (c, txt) + Nothing -> pure Nothing + +INLINEABLE(head) :: UTF8 -> Maybe Char +head = fmap fst . uncons + +INLINEABLE(last) :: UTF8 -> Maybe Char +last = fmap fst . unsnoc + +INLINEABLE(tail) :: UTF8 -> Maybe UTF8 +tail = fmap snd . uncons + +INLINEABLE(init) :: UTF8 -> Maybe UTF8 +init = fmap snd . unsnoc + +INLINEABLE(null) :: UTF8 -> Bool +null (UTF8 arr) = Array.length arr == 0 + +INLINEABLE(length) :: UTF8 -> Int +length text = unsafePerformIO $ Stream.fold Fold.length $ read text + +INLINEABLE(byteLength) :: UTF8 -> Int +byteLength (UTF8 arr) = Array.length arr + +INLINEABLE(compareLength) :: UTF8 -> Int -> Ordering +compareLength text i = unsafePerformIO $ do + res <- + read text + & Stream.scan Fold.length + & Stream.fold (Fold.takeEndBy (<= i + 1) Fold.latest) + pure + $ case res of + Nothing -> LT + Just j -> compare j i + +-------------------------------------------------------------------------------- +-- Transformation +-------------------------------------------------------------------------------- + +INLINEABLE(map) :: (Char -> Char) -> UTF8 -> UTF8 +map f text = unsafePerformIO $ do + read text + & fmap f + & Stream.fold create + +INLINEABLE(intercalate) :: UTF8 -> [UTF8] -> UTF8 +intercalate seed inp = + Stream.intercalate + (Unfold.function id) + (unUTF8 seed) + (fmap unUTF8 (Stream.fromList inp)) + & Array.fromChunks + & fmap UTF8 + & unsafePerformIO + +INLINEABLE(intersperse) :: Char -> UTF8 -> UTF8 +intersperse c text = + read text + & Stream.intersperse c + & Stream.fold create + & unsafePerformIO + +-- XXX This is a naive implementation to get things to work. +INLINEABLE(transpose) :: [UTF8] -> [UTF8] +transpose ts = P.map pack (L.transpose (P.map unpack ts)) + +INLINEABLE(reverse) :: UTF8 -> UTF8 +reverse (UTF8 arr) = + Array.readRev arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & fmap fst + & Stream.fold copyFold + & fmap (UTF8 . Array.unsafeFreeze) + & unsafePerformIO + + where + + copyFold = + Fold.foldrM' + (\(o, l) b -> + MArray.spliceUnsafe b + (Array.unsafeThaw (Array.getSliceUnsafe o l arr))) + (MArray.emptyOf (Array.length arr)) + +-- Use Boyer–Moore for strict pattern replacement? +-- Check perf and decide. +INLINEABLE(replace) + :: UTF8 -- ^ needle to search for. If this string is empty, an + -- error will occur. + -> UTF8 -- ^ replacement to replace needle with. + -> UTF8 -- ^ haystack in which to search. + -> UTF8 +UNIMPLEMENTED(replace) +{- +replace (UTF8 needle) (UTF8 replacement) (UTF8 haystack) = + Stream.splitOnSeq needle replicate (Array.read haystack) + & fromStream & unsafePerformIO +-} + +-------------------------------------------------------------------------------- +-- Case conversion +-------------------------------------------------------------------------------- + +INLINEABLE(toCaseFold) :: UTF8 -> UTF8 +toCaseFold text = + read text + & fmap (UUnfold.toList Char.caseFoldMapping) + & Stream.unfoldMany Unfold.fromList + & fromStream & unsafePerformIO + +INLINEABLE(toLower) :: UTF8 -> UTF8 +toLower text = + read text + & fmap (UUnfold.toList Char.lowerCaseMapping) + & Stream.unfoldMany Unfold.fromList + & fromStream & unsafePerformIO + +INLINEABLE(toUpper) :: UTF8 -> UTF8 +toUpper text = + read text + & fmap (UUnfold.toList Char.upperCaseMapping) + & Stream.unfoldMany Unfold.fromList + & fromStream & unsafePerformIO + +INLINEABLE(toTitle) :: UTF8 -> UTF8 +toTitle text = + read text + & fmap (UUnfold.toList Char.titleCaseMapping) + & Stream.unfoldMany Unfold.fromList + & fromStream & unsafePerformIO + +-------------------------------------------------------------------------------- +-- Justification +-------------------------------------------------------------------------------- + +INLINEABLE(justifyLeft) :: Int -> Char -> UTF8 -> UTF8 +justifyLeft i c text@(UTF8 arr) = unsafePerformIO $ do + len <- Stream.fold Fold.length $ read text + (UTF8 padding) <- fromStream $ Stream.fromPure c + new0 <- MArray.emptyOf (Array.length arr + (Array.length padding) * i) + if i > len + then do + new1 <- + Stream.replicate (i - len) (Array.unsafeThaw padding) + & Stream.fold (Fold.foldrM' MArray.spliceUnsafe (pure new0)) + new2 <- MArray.spliceUnsafe new1 (Array.unsafeThaw arr) + pure $ UTF8 $ Array.unsafeFreeze new2 + else pure text + +INLINEABLE(justifyRight) :: Int -> Char -> UTF8 -> UTF8 +justifyRight i c text@(UTF8 arr) = unsafePerformIO $ do + len <- Stream.fold Fold.length $ read text + (UTF8 padding) <- fromStream $ Stream.fromPure c + new0 <- MArray.emptyOf (Array.length arr + (Array.length padding) * i) + if i > len + then do + new1 <- MArray.spliceUnsafe new0 (Array.unsafeThaw arr) + new2 <- + Stream.replicate (i - len) (Array.unsafeThaw padding) + & Stream.fold (Fold.foldrM' MArray.spliceUnsafe (pure new1)) + pure $ UTF8 $ Array.unsafeFreeze new2 + else pure text + +INLINEABLE(center) :: Int -> Char -> UTF8 -> UTF8 +center i c text@(UTF8 arr) = unsafePerformIO $ do + len <- Stream.fold Fold.length $ read text + (UTF8 padding) <- fromStreamOf 4 (Stream.fromPure c) + new0 <- MArray.emptyOf (Array.length arr + 4 * i) + let paddingLenL = (i - len) `div` 2 + paddingLenR = i - len - paddingLenL + if i > len + then do + new1 <- + Stream.replicate paddingLenL (Array.unsafeThaw padding) + & Stream.fold (Fold.foldrM' MArray.spliceUnsafe (pure new0)) + new2 <- MArray.spliceUnsafe new1 (Array.unsafeThaw arr) + new3 <- + Stream.replicate paddingLenR (Array.unsafeThaw padding) + & Stream.fold (Fold.foldrM' MArray.spliceUnsafe (pure new2)) + pure $ UTF8 $ Array.unsafeFreeze new3 + else pure text + +-------------------------------------------------------------------------------- +-- Folds +-------------------------------------------------------------------------------- + +INLINEABLE(fold) :: MonadIO m => Fold m Char b -> UTF8 -> m b +fold f text = Stream.fold f $ read text + +INLINEABLE(concat) :: [UTF8] -> UTF8 +concat = + unsafePerformIO + . fmap UTF8 . Array.fromChunks . fmap unUTF8 . Stream.fromList + +INLINEABLE(concatMap) :: (Char -> UTF8) -> UTF8 -> UTF8 +concatMap f0 text = + read text + & StreamK.fromStream + & StreamK.concatMap f + & StreamK.toStream + & fromStream + & unsafePerformIO + where + f c = StreamK.fromStream $ read (f0 c) + +INLINEABLE(any) :: (Char -> Bool) -> UTF8 -> Bool +any f inp = + read inp + & Stream.fold (Fold.any f) + & unsafePerformIO + +INLINEABLE(all) :: (Char -> Bool) -> UTF8 -> Bool +all f inp = + read inp + & Stream.fold (Fold.all f) + & unsafePerformIO + +INLINEABLE(maximum) :: UTF8 -> Maybe Char +maximum inp = + read inp + & Stream.fold Fold.maximum + & unsafePerformIO + +INLINEABLE(minumum) :: UTF8 -> Maybe Char +minumum inp = + read inp + & Stream.fold Fold.minimum + & unsafePerformIO + +INLINEABLE(isAscii) :: UTF8 -> Bool +isAscii = all Char.isAscii + +-------------------------------------------------------------------------------- +-- Construction +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +-- Scans +-------------------------------------------------------------------------------- + +INLINEABLE(scan) :: Fold Identity Char Char -> UTF8 -> UTF8 +scan fld text = + read text + & Stream.scan fld + & Stream.fold (Fold.morphInner (Identity . unsafePerformIO) create) + & runIdentity + +INLINEABLE(mapAccum) :: Fold Identity Char (a, Char) -> UTF8 -> (Maybe a, UTF8) +mapAccum fld text = + read text + & Stream.scan fld + & Stream.fold + (Fold.tee + (Fold.lmap fst Fold.latest) + (Fold.lmap snd + (Fold.morphInner (Identity . unsafePerformIO) create))) + & runIdentity + +-------------------------------------------------------------------------------- +-- Unfolding +-------------------------------------------------------------------------------- + +INLINEABLE(replicate) :: Int -> UTF8 -> UTF8 +replicate i (UTF8 arr) = + Stream.replicate i (Array.unsafeThaw arr) + & Stream.fold + (Fold.foldrM' MArray.spliceExp (MArray.emptyOf MArray.blockSize)) + & fmap (UTF8 . Array.unsafeFreeze) + & unsafePerformIO + +INLINEABLE(unfoldr) :: (s -> Maybe (Char, s)) -> s -> UTF8 +unfoldr f s = + Stream.unfoldr f s + & fromStream & unsafePerformIO + +INLINEABLE(unfoldrN) :: Int -> (s -> Maybe (Char, s)) -> s -> UTF8 +unfoldrN i f s = + Stream.unfoldr f s + & fromStreamOf (4 * i) & unsafePerformIO + +-------------------------------------------------------------------------------- +-- Breaking +-------------------------------------------------------------------------------- + +INLINEABLE(take) :: Int -> UTF8 -> UTF8 +take i text@(UTF8 arr) = unsafePerformIO $ do + res <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.take i + & Stream.fold Fold.latest + pure + $ case res of + Nothing -> text + Just ((off, len), _) -> + UTF8 (Array.getSliceUnsafe 0 (off + len) arr) + +INLINEABLE(takeEnd) :: Int -> UTF8 -> UTF8 +takeEnd i text@(UTF8 arr) = unsafePerformIO $ do + res <- + Array.readRev arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.take i + & Stream.fold Fold.latest + let arrLen = Array.length arr + pure + $ case res of + Nothing -> text + Just ((off, len), _) -> + UTF8 (Array.getSliceUnsafe (arrLen - (off + len)) arrLen arr) + +INLINEABLE(drop) :: Int -> UTF8 -> UTF8 +drop i text@(UTF8 arr) = unsafePerformIO $ do + res <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.drop i + & Stream.fold Fold.latest + pure + $ case res of + Nothing -> text + Just ((off, len), _) -> + UTF8 (Array.getSliceUnsafe (off + len) len arr) + +INLINEABLE(dropEnd) :: Int -> UTF8 -> UTF8 +dropEnd i text@(UTF8 arr) = unsafePerformIO $ do + res <- + Array.readRev arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.drop i + & Stream.fold Fold.latest + let arrLen = Array.length arr + pure + $ case res of + Nothing -> text + Just ((off, len), _) -> + UTF8 (Array.getSliceUnsafe 0 (arrLen - (off + len)) arr) + +INLINEABLE(takeWhile) :: (Char -> Bool) -> UTF8 -> UTF8 +takeWhile p text@(UTF8 arr) = unsafePerformIO $ do + res <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.takeWhile (p . snd) + & Stream.fold Fold.latest + pure + $ case res of + Nothing -> text + Just ((off, len), _) -> + UTF8 (Array.getSliceUnsafe 0 (off + len) arr) + +INLINEABLE(takeWhileEnd) :: (Char -> Bool) -> UTF8 -> UTF8 +takeWhileEnd p text@(UTF8 arr) = unsafePerformIO $ do + res <- + Array.readRev arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.takeWhile (p . snd) + & Stream.fold Fold.latest + let arrLen = Array.length arr + pure + $ case res of + Nothing -> text + Just ((off, len), _) -> + UTF8 (Array.getSliceUnsafe (arrLen - (off + len)) arrLen arr) + +INLINEABLE(dropWhile) :: (Char -> Bool) -> UTF8 -> UTF8 +dropWhile p text@(UTF8 arr) = unsafePerformIO $ do + res <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.dropWhile (p . snd) + & Stream.fold Fold.latest + pure + $ case res of + Nothing -> text + Just ((off, len), _) -> + UTF8 (Array.getSliceUnsafe (off + len) len arr) + +INLINEABLE(dropWhileEnd) :: (Char -> Bool) -> UTF8 -> UTF8 +dropWhileEnd p text@(UTF8 arr) = unsafePerformIO $ do + res <- + Array.readRev arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.dropWhile (p . snd) + & Stream.fold Fold.latest + let arrLen = Array.length arr + pure + $ case res of + Nothing -> text + Just ((off, len), _) -> + UTF8 (Array.getSliceUnsafe 0 (arrLen - (off + len)) arr) + +INLINEABLE(dropAround) :: (Char -> Bool) -> UTF8 -> UTF8 +dropAround p text@(UTF8 arr) = unsafePerformIO $ do + beg <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.dropWhile (p . snd) + & Stream.fold Fold.latest + end <- + Array.readRev arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.dropWhile (p . snd) + & Stream.fold Fold.latest + let arrLen = Array.length arr + pure + $ case (beg, end) of + (Just ((boff, _), _), Just ((eoff, _), _)) -> + UTF8 (Array.getSliceUnsafe boff eoff arr) + (Just ((boff, _), _), Nothing) -> + UTF8 (Array.getSliceUnsafe boff arrLen arr) + (Nothing, Just ((eoff, _), _)) -> + UTF8 (Array.getSliceUnsafe 0 eoff arr) + (Nothing, Nothing) -> text + +INLINEABLE(strip) :: UTF8 -> UTF8 +strip = dropAround Char.isWhiteSpace + +INLINEABLE(stripStart) :: UTF8 -> UTF8 +stripStart = dropWhile Char.isWhiteSpace + +INLINEABLE(stripEnd) :: UTF8 -> UTF8 +stripEnd = dropWhileEnd Char.isWhiteSpace + +INLINEABLE(splitAt) :: Int -> UTF8 -> (UTF8, UTF8) +splitAt i str@(UTF8 arr) = unsafePerformIO $ do + res <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.take i + & Stream.fold Fold.latest + pure + $ case res of + Nothing -> (empty, str) + Just ((off, _), _) -> + ( UTF8 (Array.getSliceUnsafe 0 off arr) + , UTF8 (Array.getSliceUnsafe off (Array.length arr - off) arr) + ) + +INLINEABLE(breakOn) :: UTF8 -> UTF8 -> (UTF8, UTF8) +UNIMPLEMENTED(breakOn) + +INLINEABLE(breakOnEnd) :: UTF8 -> UTF8 -> (UTF8, UTF8) +UNIMPLEMENTED(breakOnEnd) + +INLINEABLE(break) :: (Char -> Bool) -> UTF8 -> (UTF8, UTF8) +break p str@(UTF8 arr) = unsafePerformIO $ do + res <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.fold (Fold.find (p . snd)) + pure + $ case res of + Nothing -> (empty, str) + Just ((off, _), _) -> + ( UTF8 (Array.getSliceUnsafe 0 off arr) + , UTF8 (Array.getSliceUnsafe off (Array.length arr - off) arr) + ) + +INLINEABLE(span) :: (Char -> Bool) -> UTF8 -> (UTF8, UTF8) +span p str@(UTF8 arr) = unsafePerformIO $ do + res <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.takeWhile (p . snd) + & Stream.fold Fold.latest + pure + $ case res of + Nothing -> (empty, str) + Just ((off, _), _) -> + ( UTF8 (Array.getSliceUnsafe 0 off arr) + , UTF8 (Array.getSliceUnsafe off (Array.length arr - off) arr) + ) + +INLINEABLE(spanM) :: Monad m => (Char -> m Bool) -> UTF8 -> m (UTF8, UTF8) +spanM p str@(UTF8 arr) = do + res <- + Unicode.decodeUtf8Indexed Unicode.ErrorOnCodingFailure (Array.read arr) + & Stream.takeWhileM (p . snd) + & Stream.fold Fold.latest + pure + $ case res of + Nothing -> (empty, str) + Just ((off, _), _) -> + ( UTF8 (Array.getSliceUnsafe 0 off arr) + , UTF8 (Array.getSliceUnsafe off (Array.length arr - off) arr) + ) + +INLINEABLE(spanEndM) :: Monad m => (Char -> m Bool) -> UTF8 -> m (UTF8, UTF8) +spanEndM p str@(UTF8 arr) = do + res <- + Array.read arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.takeWhileM (p . snd) + & Stream.fold Fold.latest + let arrLen = Array.length arr + pure + $ case res of + Nothing -> (empty, str) + Just ((off, _), _) -> + ( UTF8 (Array.getSliceUnsafe 0 (arrLen - off) arr) + , UTF8 (Array.getSliceUnsafe (arrLen - off) arrLen arr) + ) + +INLINEABLE(group) :: UTF8 -> [UTF8] +group (UTF8 arr) = + Array.read arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.parseMany (Parser.groupBy eq fld) + & Stream.catRights -- There shouldn't be any parse error here! + & fmap UTF8 + & Stream.toList + & unsafePerformIO + + where + eq a b = snd a == snd b + fld = slicer <$> Fold.tee Fold.one Fold.latest + slicer (Just ((os, _), _), Just ((oe, le), _)) = + Array.getSliceUnsafe os (oe + le) arr + slicer _ = Array.empty + +INLINEABLE(groupBy) :: (Char -> Char -> Bool) -> UTF8 -> [UTF8] +groupBy p (UTF8 arr) = + Array.read arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.parseMany (Parser.groupBy eq fld) + & Stream.catRights -- There shouldn't be any parse error here! + & fmap UTF8 + & Stream.toList + & unsafePerformIO + + where + eq a b = p (snd a) (snd b) + fld = slicer <$> Fold.tee Fold.one Fold.latest + slicer (Just ((os, _), _), Just ((oe, le), _)) = + Array.getSliceUnsafe os (oe + le) arr + slicer _ = Array.empty + +INLINEABLE(inits) :: UTF8 -> [UTF8] +inits (UTF8 arr) = + Array.read arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & fmap slicer + & fmap UTF8 + & Stream.toList + & fmap (empty:) + & unsafePerformIO + where + slicer ((o, l), _) = Array.getSliceUnsafe 0 (o + l) arr + +INLINEABLE(tails) :: UTF8 -> [UTF8] +tails (UTF8 arr) = + Array.read arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & fmap slicer + & fmap UTF8 + & Stream.fold Fold.toListRev + & fmap (P.reverse . (empty:)) + & unsafePerformIO + where + slicer ((o, _), _) = Array.getSliceUnsafe o (Array.length arr) arr + +-------------------------------------------------------------------------------- +-- Breaking into many substrings +-------------------------------------------------------------------------------- + +INLINEABLE(splitOn) :: UTF8 -> UTF8 -> [UTF8] +splitOn (UTF8 pat) (UTF8 arr) = + Array.read arr + & Stream.zipWith (,) Stream.enumerate + & Stream.splitOnSeqWith pat snd fld + & fmap UTF8 + & Stream.toList + & unsafePerformIO + where + fld = slicer <$> Fold.tee Fold.one Fold.latest + slicer (Just (s, _), Just (l, _)) = + Array.getSliceUnsafe s (l + 1) arr + slicer _ = Array.empty + +INLINEABLE(split) :: (Char -> Bool) -> UTF8 -> [UTF8] +split p (UTF8 arr) = + Array.read arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.splitOn (p . snd) fld + & fmap UTF8 + & Stream.toList + & unsafePerformIO + where + fld = slicer <$> Fold.tee Fold.one Fold.latest + slicer (Just ((os, _), _), Just ((oe, le), _)) = + Array.getSliceUnsafe os (oe + le) arr + slicer _ = Array.empty + +INLINEABLE(chunksOf) :: Int -> UTF8 -> [UTF8] +chunksOf i (UTF8 arr) = + Array.read arr + & Unicode.decodeUtf8ReverseIndexed Unicode.ErrorOnCodingFailure + & Stream.groupsOf i fld + & fmap UTF8 + & Stream.toList + & unsafePerformIO + where + fld = slicer <$> Fold.tee Fold.one Fold.latest + slicer (Just ((os, _), _), Just ((oe, le), _)) = + Array.getSliceUnsafe os (oe + le) arr + slicer _ = Array.empty + +-------------------------------------------------------------------------------- +-- Breaking into lines and words +-------------------------------------------------------------------------------- + +-- XXX isLineSeperator? +INLINEABLE(lines) :: UTF8 -> [UTF8] +lines = split (== '\n') + +INLINEABLE(words) :: UTF8 -> [UTF8] +words = split Char.isWhiteSpace + +INLINEABLE(unlines) :: [UTF8] -> UTF8 +unlines = intercalate (singleton '\n') + +INLINEABLE(unwords) :: [UTF8] -> UTF8 +unwords = intercalate (singleton ' ') + +-------------------------------------------------------------------------------- +-- Predicates +-------------------------------------------------------------------------------- + +INLINEABLE(isPrefixOf) :: UTF8 -> UTF8 -> Bool +UNIMPLEMENTED(isPrefixOf) + +INLINEABLE(isSuffixOf) :: UTF8 -> UTF8 -> Bool +UNIMPLEMENTED(isSuffixOf) + +INLINEABLE(isInfixOf) :: UTF8 -> UTF8 -> Bool +UNIMPLEMENTED(isInfixOf) + +-------------------------------------------------------------------------------- +-- View Patterns +-------------------------------------------------------------------------------- + +INLINEABLE(stripPrefix) :: UTF8 -> UTF8 -> Bool +UNIMPLEMENTED(stripPrefix) + +INLINEABLE(stripSuffix) :: UTF8 -> UTF8 -> Bool +UNIMPLEMENTED(stripSuffix) + +INLINEABLE(commonPrefixes) :: UTF8 -> UTF8 -> Maybe (UTF8, UTF8, UTF8) +UNIMPLEMENTED(commonPrefixes) + +-------------------------------------------------------------------------------- +-- Searching +-------------------------------------------------------------------------------- + +INLINEABLE(filter) :: (Char -> Bool) -> UTF8 -> UTF8 +filter p inp = + read inp + & Stream.filter p + & Stream.fold (createOf (byteLength inp)) + & unsafePerformIO + +INLINEABLE(breakOnAll) + :: UTF8 -- ^ needle to search for + -> UTF8 -- ^ haystack in which to search + -> [(UTF8, UTF8)] +UNIMPLEMENTED(breakOnAll) + +INLINEABLE(find) :: (Char -> Bool) -> UTF8 -> Maybe Char +find p inp = + read inp + & Stream.fold (Fold.find p) + & unsafePerformIO + +INLINEABLE(elem) :: Char -> UTF8 -> Bool +elem c inp = + read inp + & Stream.fold (Fold.elem c) + & unsafePerformIO + +INLINEABLE(partition) :: (Char -> Bool) -> UTF8 -> (UTF8, UTF8) +partition p inp = + read inp + & Stream.filter p + & Stream.fold (Fold.tee (Fold.filter p create) (Fold.filter p create)) + & unsafePerformIO + +-------------------------------------------------------------------------------- +-- Indexing +-------------------------------------------------------------------------------- + +INLINEABLE(index) :: UTF8 -> Int -> Maybe Char +index inp i = + read inp + & Stream.fold (Fold.index i) + & unsafePerformIO + +INLINEABLE(findIndex) :: (Char -> Bool) -> UTF8 -> Maybe Int +findIndex p inp = + read inp + & Stream.fold (Fold.findIndex p) + & unsafePerformIO + +-------------------------------------------------------------------------------- +-- Zipping +-------------------------------------------------------------------------------- + +INLINEABLE(zip) :: UTF8 -> UTF8 -> [(Char, Char)] +zip l r = + Stream.zipWith (,) (read l) (read r) + & Stream.toList + & unsafePerformIO + +-------------------------------------------------------------------------------- +-- Low level operations +-------------------------------------------------------------------------------- + +INLINEABLE(copy) :: UTF8 -> UTF8 +copy (UTF8 arr) = unsafePerformIO $ do + new <- MArray.emptyOf (Array.length arr) + new1 <- MArray.spliceUnsafe new (Array.unsafeThaw arr) + pure $ UTF8 $ Array.unsafeFreeze new1 + +INLINEABLE(unpackCString#) :: Addr# -> UTF8 +unpackCString# addr = + Unicode.fromStr# addr + & fromStream + & unsafePerformIO + +INLINEABLE(unpackCStringAscii#) :: Addr# -> UTF8 +unpackCStringAscii# addr = UTF8 (Array.fromByteStr# addr) + +INLINEABLE(measureOff) :: Int -> UTF8 -> UTF8 +UNIMPLEMENTED(measureOff) diff --git a/core/streamly-core.cabal b/core/streamly-core.cabal index 8a2fdd1f3f..07d68044a3 100644 --- a/core/streamly-core.cabal +++ b/core/streamly-core.cabal @@ -374,6 +374,7 @@ library -- streamly-unicode-core , Streamly.Internal.Unicode.Stream , Streamly.Internal.Unicode.String + , Streamly.Internal.Unicode.UTF8 , Streamly.Internal.Unicode.Parser , Streamly.Internal.Unicode.Array @@ -528,6 +529,7 @@ library , filepath >= 1.4.2 && < 1.6 -- streamly-unicode-core + , unicode-data >= 0.4 && < 0.5 , template-haskell >= 2.14 && < 2.22 -- streamly-filesystem-core diff --git a/default.nix b/default.nix index 29c95d06d2..e320073efc 100644 --- a/default.nix +++ b/default.nix @@ -71,12 +71,12 @@ let haskellPackages = sha256 = "1bj9agy3x0yjbscpjgn96gpnj4lvkh39spjvy3jnrr3a42v3ynw7"; } {}; - #unicode-data = - # super.callHackageDirect - # { pkg = "unicode-data"; - # ver = "0.2.0"; - # sha256 = "14crb68g79yyw87fgh49z2fn4glqx0zr53v6mapihaxzkikhkkc3"; - # } {}; + unicode-data = + super.callHackageDirect + { pkg = "unicode-data"; + ver = "0.4.0.1"; + sha256 = "sha256-aLkjgYkNldnobA0PlFkmuPQSHQQYxoxyR4ibk2bgiUI="; + } {}; #tasty-bench = super.tasty-bench_0_3_1; diff --git a/src/Streamly/Internal/Unicode/Utf8.hs b/src/Streamly/Internal/Unicode/Utf8.hs deleted file mode 100644 index 1f9877a123..0000000000 --- a/src/Streamly/Internal/Unicode/Utf8.hs +++ /dev/null @@ -1,73 +0,0 @@ --- | --- Module : Streamly.Internal.Unicode.Utf8 --- Copyright : (c) 2021 Composewell Technologies --- License : BSD-3-Clause --- Maintainer : streamly@composewell.com --- Stability : experimental --- Portability : GHC - --- XXX We can move this to stream-core/streamly-unicode-core, and provide an --- additional module in streamly-unicode for case conversions (because it --- depends on unicode-data). Or just keep all of it in streamly-unicode --- which will have a dependency on unicode-data. - -module Streamly.Internal.Unicode.Utf8 - ( - -- * Type - Utf8 - - -- * Creation and elimination - , pack - , unpack - , toArray - ) -where - --------------------------------------------------------------------------------- --- Imports --------------------------------------------------------------------------------- - -import Data.Word (Word8) -import Streamly.Data.Array (Array) -import System.IO.Unsafe (unsafePerformIO) - -import qualified Streamly.Data.Fold as Fold -import qualified Streamly.Data.Stream as Stream -import qualified Streamly.Internal.Data.Array as Array - ( fromStreamN - , read - ) -import qualified Streamly.Internal.Unicode.Stream as Unicode - --------------------------------------------------------------------------------- --- Type --------------------------------------------------------------------------------- - --- | A space efficient, packed, unboxed Unicode container. -newtype Utf8 = Utf8 (Array Word8) - --------------------------------------------------------------------------------- --- Functions --------------------------------------------------------------------------------- - -{-# INLINE toArray #-} -toArray :: Utf8 -> Array Word8 -toArray (Utf8 arr) = arr - - -{-# INLINEABLE pack #-} -pack :: String -> Utf8 -pack s = - Utf8 - $ unsafePerformIO - $ Array.fromStreamN len $ Unicode.encodeUtf8' $ Stream.fromList s - - where - - len = length s - -{-# INLINEABLE unpack #-} -unpack :: Utf8 -> String -unpack u = - unsafePerformIO - $ Stream.fold Fold.toList $ Unicode.decodeUtf8' $ Array.read $ toArray u