diff --git a/bench/bench.hs b/bench/bench.hs index a26ab8f..36209f9 100644 --- a/bench/bench.hs +++ b/bench/bench.hs @@ -25,6 +25,7 @@ import Test.QuickCheck (vectorOf, arbitrary) import Test.Warden.Arbitrary import Warden.Data +import Warden.Numeric import Warden.Row import Warden.View @@ -55,6 +56,10 @@ prepareHashText :: IO [ByteString] prepareHashText = generate' (Deterministic 54321) (GenSize 30) $ vectorOf 1000 arbitrary +prepareNumbers :: IO [Double] +prepareNumbers = + generate' (Deterministic 2468) (GenSize 100) $ vectorOf 10000 arbitrary + prepareFolds :: IO ([Row], [ByteString]) prepareFolds = (,) <$> prepareSVParse <*> prepareHashText @@ -80,6 +85,9 @@ benchHashText = fmap hashText benchUpdateTextCounts :: [Row] -> TextCounts benchUpdateTextCounts rs = foldl' (flip (updateTextCounts (TextFreeformThreshold 100))) NoTextCounts rs +benchUpdateNumericState :: [Double] -> NumericState +benchUpdateNumericState ns = foldl' updateNumericState initialNumericState ns + main :: IO () main = do withTempDirectory "." "warden-bench-" $ \root -> @@ -98,4 +106,8 @@ main = do , bench "hashText/1000" $ nf benchHashText ts , bench "updateTextCounts/1000" $ nf benchUpdateTextCounts rs ] + , env prepareNumbers $ \ ~(ns) -> + bgroup "numerics" $ [ + bench "updateNumericState/10000" $ nf benchUpdateNumericState ns + ] ] diff --git a/src/Warden/Data/Numeric.hs b/src/Warden/Data/Numeric.hs index 3fd9a95..bd6ab1b 100644 --- a/src/Warden/Data/Numeric.hs +++ b/src/Warden/Data/Numeric.hs @@ -2,29 +2,41 @@ {-# LANGUAGE LambdaCase #-} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE DeriveGeneric #-} module Warden.Data.Numeric ( - Minimum(..) + KAcc(..) , Maximum(..) , Mean(..) - , Count(..) + , MeanAcc(..) + , MeanDevAcc(..) , Median(..) - , StdDev(..) + , Minimum(..) + , NumericState(..) , NumericSummary(..) + , StdDev(..) + , StdDevAcc(..) , Variance(..) - , fromVariance + , initialNumericState , mkStdDev + , stateMaximum + , stateMeanDev + , stateMinimum ) where -import Data.Aeson -import Data.Aeson.Types +import Control.Lens (makeLenses) + +import GHC.Generics (Generic) import P data Minimum = Minimum {-# UNPACK #-} !Double | NoMinimum - deriving (Eq, Show) + deriving (Eq, Show, Generic) + +instance NFData Minimum instance Monoid Minimum where mempty = NoMinimum @@ -41,7 +53,9 @@ instance Monoid Minimum where data Maximum = Maximum {-# UNPACK #-} !Double | NoMaximum - deriving (Eq, Show) + deriving (Eq, Show, Generic) + +instance NFData Maximum instance Monoid Maximum where mempty = NoMaximum @@ -55,42 +69,102 @@ instance Monoid Maximum where else Maximum prev {-# INLINE mappend #-} -newtype Count = Count { getCount :: Int } - deriving (Eq, Show, ToJSON, FromJSON) +-- | Counter param for mean/stddev calculation. Equal to one plus the number +-- of records seen. +newtype KAcc = + KAcc { + getKAcc :: Int + } deriving (Eq, Show, Generic, Num) + +instance NFData KAcc -newtype Mean = Mean { getMean :: Double } - deriving (Eq, Show, ToJSON, FromJSON) +-- | Preliminary mean, still accumulating. +newtype MeanAcc = + MeanAcc { + unMeanAcc :: Double + } deriving (Eq, Show, Generic) + +instance NFData MeanAcc + +-- | Final mean. +data Mean = + NoMean + | Mean {-# UNPACK #-} !Double + deriving (Eq, Show, Generic) + +instance NFData Mean data Median = Median {-# UNPACK #-} !Double | NoMedian - deriving (Eq, Show) + deriving (Eq, Show, Generic) -newtype Variance = Variance { getVariance :: Double } - deriving (Eq, Show, ToJSON, FromJSON) +instance NFData Median -fromVariance :: Variance -> StdDev -fromVariance = StdDev . sqrt . getVariance +-- | Accumulator for standard deviation calculation. Closer to variance than +-- standard deviation to avoid repeated square roots. +-- +-- \( acc = \sigma^{2} (k - 1) \) +-- +-- Where `acc` is 'StdDevAcc' and `k` is the 'KAcc'. +newtype StdDevAcc = + StdDevAcc { + unStdDevAcc :: Double + } deriving (Eq, Show, Generic) -newtype StdDev = StdDev { getStdDev :: Double } - deriving (Eq, Show, ToJSON) +instance NFData StdDevAcc -mkStdDev :: Double -> Maybe StdDev -mkStdDev v - | v < 0.0 = Nothing - | otherwise = Just $ StdDev v +newtype Variance = + Variance { + unVariance :: Double + } deriving (Eq, Show, Generic) + +instance NFData Variance -instance FromJSON StdDev where - parseJSON (Number v) = case mkStdDev ((fromRational . toRational) v) of - Nothing -> fail "StdDev must not be negative" - Just v' -> pure v' - parseJSON x = typeMismatch "StdDev" x +data StdDev = + NoStdDev + | StdDev {-# UNPACK #-} !Double + deriving (Eq, Show, Generic) + +instance NFData StdDev + +mkStdDev :: Double -> StdDev +mkStdDev v + | v < 0.0 = NoStdDev + | otherwise = StdDev v -- | So we can cheaply keep track of long-term change in numeric datasets. -- Will probably also end up in brandix. data NumericSummary = NumericSummary !Minimum !Maximum - {-# UNPACK #-} !Mean - {-# UNPACK #-} !StdDev + !Mean + !StdDev !Median - deriving (Eq, Show) + deriving (Eq, Show, Generic) + +instance NFData NumericSummary + +data MeanDevAcc = + MeanDevInitial + | MeanDevAcc {-# UNPACK #-} !MeanAcc !(Maybe StdDevAcc) {-# UNPACK #-} !KAcc + deriving (Eq, Show, Generic) + +instance NFData MeanDevAcc + +data NumericState = + NumericState { + _stateMinimum :: !Minimum + , _stateMaximum :: !Maximum + , _stateMeanDev :: !MeanDevAcc + } deriving (Eq, Show, Generic) + +instance NFData NumericState + +makeLenses ''NumericState + +initialNumericState :: NumericState +initialNumericState = + NumericState + NoMinimum + NoMaximum + MeanDevInitial diff --git a/src/Warden/Data/Row.hs b/src/Warden/Data/Row.hs index 4e291b4..6e69c65 100644 --- a/src/Warden/Data/Row.hs +++ b/src/Warden/Data/Row.hs @@ -38,7 +38,7 @@ module Warden.Data.Row ( , totalRows ) where -import Control.Lens +import Control.Lens (makeLenses, (^.), (%~)) import Data.ByteString (ByteString) import Data.Char (chr, ord) diff --git a/src/Warden/Numeric.hs b/src/Warden/Numeric.hs index d58c8f5..0a54f1b 100644 --- a/src/Warden/Numeric.hs +++ b/src/Warden/Numeric.hs @@ -2,22 +2,26 @@ {-# LANGUAGE BangPatterns #-} module Warden.Numeric ( - MeanDevAcc(..) + combineMeanAcc + , combineMeanDevAcc + , combineStdDevAcc + , finalizeMeanDev + , finalizeStdDevAcc + , stdDevAccFromVariance + , summarizeNumericState , updateMinimum , updateMaximum , updateMeanDev - , finalizeMeanDev + , updateNumericState + , varianceFromStdDevAcc ) where +import Control.Lens ((%~), (^.)) + import P import Warden.Data -data MeanDevAcc = - MeanDevInitial - | MeanDevAcc {-# UNPACK #-} !Mean !(Maybe Variance) {-# UNPACK #-} !Count - deriving (Eq, Show) - updateMinimum :: Real a => Minimum -> a -> Minimum updateMinimum !acc x = @@ -32,7 +36,9 @@ updateMaximum !acc x = in acc <> x' {-# INLINE updateMaximum #-} --- Numerically-stable mean and variance. +-- Minimal-error mean and variance. +-- +-- From Knuth (TAoCP v2, Seminumerical Algorithms, p232). -- -- \( \frac{1}{n} \sum_{x \in X} x \equiv M_1 = X_1, M_k = M_{k-1} + \frac{(X_k - M_{k-1})}{k} \) updateMeanDev :: Real a @@ -40,24 +46,126 @@ updateMeanDev :: Real a updateMeanDev !macc x = let x' = (fromRational . toRational) x in case macc of MeanDevInitial -> - let i = Count 1 - m = Mean 0 + let i = KAcc 1 + m = MeanAcc 0 s = Nothing in update' m s i x' (MeanDevAcc m s i) -> update' m s i x' where - update' (Mean m) s (Count i) v = + update' (MeanAcc m) s (KAcc i) v = let delta = v - m - m' = Mean $ m + delta / (fromIntegral i) - i' = Count $ i + 1 + m' = MeanAcc $ m + delta / (fromIntegral i) + i' = KAcc $ i + 1 s' = case s of - Nothing -> Just $ Variance 0 - Just (Variance var) -> Just . Variance $ var + delta * (v - (getMean m')) + Nothing -> + Just $ StdDevAcc 0 + Just (StdDevAcc sda) -> + Just . StdDevAcc $ sda + (delta * (v - (unMeanAcc m'))) in MeanDevAcc m' s' i' {-# INLINE updateMeanDev #-} -finalizeMeanDev :: MeanDevAcc -> Maybe (Mean, StdDev) -finalizeMeanDev MeanDevInitial = Nothing -finalizeMeanDev (MeanDevAcc _ Nothing _) = Nothing -finalizeMeanDev (MeanDevAcc mn (Just var) _) = Just (mn, fromVariance var) +finalizeMeanDev :: MeanDevAcc -> (Mean, StdDev) +finalizeMeanDev MeanDevInitial = (NoMean, NoStdDev) +finalizeMeanDev (MeanDevAcc _ Nothing _) = (NoMean, NoStdDev) +finalizeMeanDev (MeanDevAcc mn (Just sda) n) = (Mean (unMeanAcc mn), finalizeStdDevAcc n sda) + +-- FIXME: median +updateNumericState :: Real a + => NumericState -> a -> NumericState +updateNumericState acc x = + (stateMinimum %~ (flip updateMinimum x)) + . (stateMaximum %~ (flip updateMaximum x)) + . (stateMeanDev %~ (flip updateMeanDev x)) + $!! acc +{-# INLINE updateNumericState #-} + +summarizeNumericState :: NumericState -> NumericSummary +summarizeNumericState st = + let (mn, stddev) = finalizeMeanDev $ st ^. stateMeanDev in + NumericSummary + (st ^. stateMinimum) + (st ^. stateMaximum) + mn + stddev + NoMedian + +-- FIXME: this might commute error, requires further thought. +combineMeanDevAcc :: MeanDevAcc -> MeanDevAcc -> MeanDevAcc +combineMeanDevAcc MeanDevInitial MeanDevInitial = MeanDevInitial +combineMeanDevAcc MeanDevInitial md2 = md2 +combineMeanDevAcc md1 MeanDevInitial = md1 +combineMeanDevAcc (MeanDevAcc mu1 s1 c1) (MeanDevAcc mu2 s2 c2) = + let mu' = combineMeanAcc (mu1, c1) (mu2, c2) + sda' = combineStdDevAcc mu' (mu1, s1, c1) (mu2, s2, c2) + -- KAccs are off-by-one from the actual number of values seen, so + -- subtract one from the sum to prevent it becoming off-by-two. + c' = c1 + c2 - (KAcc 1) in + MeanDevAcc mu' sda' c' +{-# INLINE combineMeanDevAcc #-} + +-- | Combine stddev accumulators of two subsets by converting to variance +-- (pretty cheap), combining the variances (less cheap), and converting back. +-- +-- There's almost certainly a better way to do this. +combineStdDevAcc :: MeanAcc -- ^ Combined mean. + -> (MeanAcc, Maybe StdDevAcc, KAcc) -- ^ First subset. + -> (MeanAcc, Maybe StdDevAcc, KAcc) -- ^ Second subset. + -> Maybe StdDevAcc +combineStdDevAcc _ (_, Nothing, _) (_, Nothing, _) = + Nothing +combineStdDevAcc _ (_, Just (StdDevAcc s1), _) (_, Nothing, _) = + Just $ StdDevAcc s1 +combineStdDevAcc _ (_, Nothing, _) (_, Just (StdDevAcc s2), _) = + Just $ StdDevAcc s2 +combineStdDevAcc muHat (mu1, Just sda1, c1) (mu2, Just sda2, c2) = + let var1 = varianceFromStdDevAcc c1 sda1 + var2 = varianceFromStdDevAcc c2 sda2 in + Just . stdDevAccFromVariance (c1 + c2 - (KAcc 1)) $ + combineVariance muHat (mu1, var1, c1) (mu2, var2, c2) +{-# INLINE combineStdDevAcc #-} + +-- | Combine variances of two subsets of a sample (that is, exact variance of +-- datasets rather than estimate of variance of population). +combineVariance :: MeanAcc -- ^ Combined mean. + -> (MeanAcc, Variance, KAcc) -- ^ First subset. + -> (MeanAcc, Variance, KAcc) -- ^ Second subset. + -> Variance +combineVariance (MeanAcc muHat) (MeanAcc mu1, Variance var1, KAcc c1) (MeanAcc mu2, Variance var2, KAcc c2) = + let t1 = c1' * (var1 + (mu1 ** two)) + t2 = c2' * (var2 + (mu2 ** two)) in + Variance $ ((t1 + t2) / (c1' + c2')) - (muHat ** two) + where + c1' = fromIntegral $ c1 - 1 + + c2' = fromIntegral $ c2 - 1 + + two = 2.0 :: Double +{-# INLINE combineVariance #-} + +-- | Combine mean of two subsets, given subset means and size. +combineMeanAcc :: (MeanAcc, KAcc) -> (MeanAcc, KAcc) -> MeanAcc +combineMeanAcc (MeanAcc mu1, KAcc c1) (MeanAcc mu2, KAcc c2) = + let c1' = fromIntegral $ c1 - 1 + c2' = fromIntegral $ c2 - 1 in + MeanAcc $ ((mu1 * c1') + (mu2 * c2')) / (c1' + c2') +{-# INLINE combineMeanAcc #-} + +finalizeStdDevAcc :: KAcc -> StdDevAcc -> StdDev +finalizeStdDevAcc ka sda = + stdDevFromVariance $ varianceFromStdDevAcc ka sda +{-# INLINE finalizeStdDevAcc #-} + +varianceFromStdDevAcc :: KAcc -> StdDevAcc -> Variance +varianceFromStdDevAcc (KAcc n) (StdDevAcc sda) = + Variance $ sda / fromIntegral (n - 1) +{-# INLINE varianceFromStdDevAcc #-} + +stdDevAccFromVariance :: KAcc -> Variance -> StdDevAcc +stdDevAccFromVariance (KAcc n) (Variance var) = + StdDevAcc $ var * fromIntegral (n - 1) +{-# INLINE stdDevAccFromVariance #-} + +stdDevFromVariance :: Variance -> StdDev +stdDevFromVariance = StdDev . sqrt . unVariance +{-# INLINE stdDevFromVariance #-} diff --git a/src/Warden/Serial/Json/Numeric.hs b/src/Warden/Serial/Json/Numeric.hs index b4c8bf4..0ccd2f4 100644 --- a/src/Warden/Serial/Json/Numeric.hs +++ b/src/Warden/Serial/Json/Numeric.hs @@ -20,8 +20,8 @@ fromNumericSummary (NumericSummary mn mx mean s md) = object [ "version" .= ("v1" :: Text) , "minimum" .= fromMinimum mn , "maximum" .= fromMaximum mx - , "mean" .= mean - , "stddev" .= s + , "mean" .= fromMean mean + , "stddev" .= fromStdDev s , "median" .= fromMedian md ] @@ -31,8 +31,8 @@ toNumericSummary (Object o) = "v1" -> NumericSummary <$> (toMinimum =<< (o .: "minimum")) <*> (toMaximum =<< (o .: "maximum")) - <*> o .: "mean" - <*> o .: "stddev" + <*> (toMean =<< (o .: "mean")) + <*> (toStdDev =<< (o .: "stddev")) <*> (toMedian =<< (o .: "median")) v -> fail $ "Warden.Data.Numeric.NumericSummary: unknown version [" <> v <> "]" toNumericSummary x = typeMismatch "Warden.Data.Numeric.NumericSummary" x @@ -93,3 +93,41 @@ toMedian (Object o) = do pure $ Median v s -> fail . T.unpack $ "invalid Median type: " <> s toMedian x = typeMismatch "Warden.Data.Numeric.Median" x + +fromMean :: Mean -> Value +fromMean (Mean v) = object [ + "type" .= String "mean" + , "value" .= toJSON v + ] +fromMean NoMean = object [ + "type" .= String "no-mean" + ] + +toMean :: Value -> Parser Mean +toMean (Object o) = do + o .: "type" >>= \case + "no-mean" -> pure NoMean + "mean" -> do + v <- parseJSON =<< (o .: "value") + pure $ Mean v + s -> fail . T.unpack $ "invalid Mean type: " <> s +toMean x = typeMismatch "Warden.Data.Numeric.Mean" x + +fromStdDev :: StdDev -> Value +fromStdDev (StdDev v) = object [ + "type" .= String "stddev" + , "value" .= toJSON v + ] +fromStdDev NoStdDev = object [ + "type" .= String "no-stddev" + ] + +toStdDev :: Value -> Parser StdDev +toStdDev (Object o) = do + o .: "type" >>= \case + "no-stddev" -> pure NoStdDev + "stddev" -> do + v <- parseJSON =<< (o .: "value") + pure $ StdDev v + s -> fail . T.unpack $ "invalid StdDev type: " <> s +toStdDev x = typeMismatch "Warden.Data.Numeric.StdDev" x diff --git a/test/Test/Warden/Arbitrary.hs b/test/Test/Warden/Arbitrary.hs index 659fe03..bf6dd3e 100644 --- a/test/Test/Warden/Arbitrary.hs +++ b/test/Test/Warden/Arbitrary.hs @@ -44,11 +44,25 @@ import Warden.Data import Warden.Sampling.Reservoir instance AEq Mean where + NoMean === NoMean = True + NoMean === _ = False + _ === NoMean = False (Mean x) === (Mean y) = x === y + + NoMean ~== NoMean = True + NoMean ~== _ = False + _ ~== NoMean = False (Mean x) ~== (Mean y) = x ~== y instance AEq StdDev where + NoStdDev === NoStdDev = True + NoStdDev === _ = False + _ === NoStdDev = False (StdDev x) === (StdDev y) = x === y + + NoStdDev ~== NoStdDev = True + NoStdDev ~== _ = False + _ ~== NoStdDev = False (StdDev x) ~== (StdDev y) = x ~== y newtype ValidRow = @@ -551,3 +565,34 @@ instance Arbitrary ExitType where instance Arbitrary IncludeDotFiles where arbitrary = elements [IncludeDotFiles, NoIncludeDotFiles] + +smallPositiveEven :: Gen Int +smallPositiveEven = fmap (* 2) (choose (1, 20)) + +instance AEq MeanAcc where + (===) = (==) + + (MeanAcc x) ~== (MeanAcc y) = x ~== y + +instance AEq StdDevAcc where + (===) = (==) + + (StdDevAcc x) ~== (StdDevAcc y) = x ~== y + +instance AEq MeanDevAcc where + (===) = (==) + + MeanDevInitial ~== MeanDevInitial = True + MeanDevInitial ~== _ = False + _ ~== MeanDevInitial = False + (MeanDevAcc mu1 s21 n1) ~== (MeanDevAcc mu2 s22 n2) = and [ + mu1 ~== mu2 + , s21 ~== s22 + , n1 == n2 + ] + +instance Arbitrary KAcc where + arbitrary = fmap (KAcc . unNPlus) arbitrary + +instance Arbitrary StdDevAcc where + arbitrary = fmap StdDevAcc (choose (0.0, 10000.0)) diff --git a/test/Test/Warden/Numeric.hs b/test/Test/Warden/Numeric.hs index b487a6c..38a26fa 100644 --- a/test/Test/Warden/Numeric.hs +++ b/test/Test/Warden/Numeric.hs @@ -5,6 +5,7 @@ module Test.Warden.Numeric where import Data.AEq (AEq) +import Data.List (take) import Disorder.Core.Property ((~~~)) @@ -20,6 +21,28 @@ import Test.Warden.Arbitrary import Warden.Data import Warden.Numeric +textbookMean :: (Num a, Fractional a) => [a] -> a +textbookMean xs = + (sum xs) / (fromIntegral $ length xs) + +textbookVariance :: (Num a, Fractional a) => a -> [a] -> a +textbookVariance mu xs = + (foldr (\v acc -> acc + ((v - mu) ^ two)) 0.0 xs) / fromIntegral (length xs) + where + two :: Int + two = 2 + +associativity :: (Show c, AEq c) + => (a -> b -> a) -> a -> [b] -> (a -> c) -> Property +associativity f y0 xs g = + left' ~~~ right' + where + left' = g $ foldl f y0 xs + + right' = g $ foldr f' y0 xs + + f' a b = f b a + prop_updateMinimum_positive :: Minimum -> Property prop_updateMinimum_positive NoMinimum = forAll (arbitrary :: Gen Double) $ \x -> (updateMinimum NoMinimum x) === (Minimum x) @@ -60,31 +83,36 @@ prop_updateMaximum_associative n = forAll (vectorOf n (arbitrary :: Gen Double)) prop_updateMeanDev :: NPlus -> Property prop_updateMeanDev (NPlus n) = forAll (vectorOf n (arbitrary :: Gen Double)) $ \xs -> - let nsMeanDev = finalizeMeanDev $ foldl updateMeanDev MeanDevInitial xs - - mu = (sum xs) / (fromIntegral n) - var = foldr (\v acc -> acc + ((v - mu) ^ two)) 0.0 xs + let mda = foldl updateMeanDev MeanDevInitial xs + nsMeanDev = finalizeMeanDev mda + mu = textbookMean xs + var = textbookVariance mu xs sd = StdDev $ sqrt var - uMeanDev = Just (Mean mu, sd) - in nsMeanDev ~~~ uMeanDev + uMeanDev = (Mean mu, sd) in + (nsMeanDev, Just (n+1)) ~~~ (uMeanDev, meanDevKAcc mda) where - two :: Integer - two = 2 + meanDevKAcc MeanDevInitial = Nothing + meanDevKAcc (MeanDevAcc _ _ (KAcc c)) = Just c prop_updateMeanDev_associative :: Int -> Property prop_updateMeanDev_associative n = forAll (vectorOf n (arbitrary :: Gen Double)) $ \xs -> associativity updateMeanDev MeanDevInitial xs finalizeMeanDev -associativity :: (Show c, AEq c) - => (a -> b -> a) -> a -> [b] -> (a -> c) -> Property -associativity f y0 xs g = - left' ~~~ right' - where - left' = g $ foldl f y0 xs - - right' = g $ foldr f' y0 xs - - f' a b = f b a +prop_combineMeanDevAcc :: Property +prop_combineMeanDevAcc = forAll smallPositiveEven $ \n -> forAll (vectorOf n (arbitrary :: Gen Double)) $ \xs -> + let m = n `div` 2 + mda = foldl updateMeanDev MeanDevInitial xs + xs1 = take m xs + xs2 = drop m xs + mda1 = foldl updateMeanDev MeanDevInitial xs1 + mda2 = foldl updateMeanDev MeanDevInitial xs2 + mda' = combineMeanDevAcc mda1 mda2 in + mda ~~~ mda' + +prop_tripping_StdDevAcc :: KAcc -> StdDevAcc -> Property +prop_tripping_StdDevAcc ka sda = + let sda' = stdDevAccFromVariance ka $ varianceFromStdDevAcc ka sda in + sda ~~~ sda' return [] tests :: IO Bool