Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numeric fixes + combinators #99

Merged
merged 12 commits into from
Apr 5, 2016
12 changes: 12 additions & 0 deletions bench/bench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import Test.QuickCheck (vectorOf, arbitrary)
import Test.Warden.Arbitrary

import Warden.Data
import Warden.Numeric
import Warden.Row
import Warden.View

Expand Down Expand Up @@ -55,6 +56,10 @@ prepareHashText :: IO [ByteString]
prepareHashText =
generate' (Deterministic 54321) (GenSize 30) $ vectorOf 1000 arbitrary

prepareNumbers :: IO [Double]
prepareNumbers =
generate' (Deterministic 2468) (GenSize 100) $ vectorOf 10000 arbitrary

prepareFolds :: IO ([Row], [ByteString])
prepareFolds = (,) <$> prepareSVParse <*> prepareHashText

Expand All @@ -80,6 +85,9 @@ benchHashText = fmap hashText
benchUpdateTextCounts :: [Row] -> TextCounts
benchUpdateTextCounts rs = foldl' (flip (updateTextCounts (TextFreeformThreshold 100))) NoTextCounts rs

benchUpdateNumericState :: [Double] -> NumericState
benchUpdateNumericState ns = foldl' updateNumericState initialNumericState ns

main :: IO ()
main = do
withTempDirectory "." "warden-bench-" $ \root ->
Expand All @@ -98,4 +106,8 @@ main = do
, bench "hashText/1000" $ nf benchHashText ts
, bench "updateTextCounts/1000" $ nf benchUpdateTextCounts rs
]
, env prepareNumbers $ \ ~(ns) ->
bgroup "numerics" $ [
bench "updateNumericState/10000" $ nf benchUpdateNumericState ns
]
]
136 changes: 105 additions & 31 deletions src/Warden/Data/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,41 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE DeriveGeneric #-}

module Warden.Data.Numeric (
Minimum(..)
KAcc(..)
, Maximum(..)
, Mean(..)
, Count(..)
, MeanAcc(..)
, MeanDevAcc(..)
, Median(..)
, StdDev(..)
, Minimum(..)
, NumericState(..)
, NumericSummary(..)
, StdDev(..)
, StdDevAcc(..)
, Variance(..)
, fromVariance
, initialNumericState
, mkStdDev
, stateMaximum
, stateMeanDev
, stateMinimum
) where

import Data.Aeson
import Data.Aeson.Types
import Control.Lens (makeLenses)

import GHC.Generics (Generic)

import P

data Minimum =
Minimum {-# UNPACK #-} !Double
| NoMinimum
deriving (Eq, Show)
deriving (Eq, Show, Generic)

instance NFData Minimum

instance Monoid Minimum where
mempty = NoMinimum
Expand All @@ -41,7 +53,9 @@ instance Monoid Minimum where
data Maximum =
Maximum {-# UNPACK #-} !Double
| NoMaximum
deriving (Eq, Show)
deriving (Eq, Show, Generic)

instance NFData Maximum

instance Monoid Maximum where
mempty = NoMaximum
Expand All @@ -55,42 +69,102 @@ instance Monoid Maximum where
else Maximum prev
{-# INLINE mappend #-}

newtype Count = Count { getCount :: Int }
deriving (Eq, Show, ToJSON, FromJSON)
-- | Counter param for mean/stddev calculation. Equal to one plus the number
-- of records seen.
newtype KAcc =
KAcc {
getKAcc :: Int
} deriving (Eq, Show, Generic, Num)

instance NFData KAcc

newtype Mean = Mean { getMean :: Double }
deriving (Eq, Show, ToJSON, FromJSON)
-- | Preliminary mean, still accumulating.
newtype MeanAcc =
MeanAcc {
unMeanAcc :: Double
} deriving (Eq, Show, Generic)

instance NFData MeanAcc

-- | Final mean.
data Mean =
NoMean
| Mean {-# UNPACK #-} !Double
deriving (Eq, Show, Generic)

instance NFData Mean

data Median =
Median {-# UNPACK #-} !Double
| NoMedian
deriving (Eq, Show)
deriving (Eq, Show, Generic)

newtype Variance = Variance { getVariance :: Double }
deriving (Eq, Show, ToJSON, FromJSON)
instance NFData Median

fromVariance :: Variance -> StdDev
fromVariance = StdDev . sqrt . getVariance
-- | Accumulator for standard deviation calculation. Closer to variance than
-- standard deviation to avoid repeated square roots.
--
-- \( acc = \sigma^{2} (k - 1) \)
--
-- Where `acc` is 'StdDevAcc' and `k` is the 'KAcc'.
newtype StdDevAcc =
StdDevAcc {
unStdDevAcc :: Double
} deriving (Eq, Show, Generic)

newtype StdDev = StdDev { getStdDev :: Double }
deriving (Eq, Show, ToJSON)
instance NFData StdDevAcc

mkStdDev :: Double -> Maybe StdDev
mkStdDev v
| v < 0.0 = Nothing
| otherwise = Just $ StdDev v
newtype Variance =
Variance {
unVariance :: Double
} deriving (Eq, Show, Generic)

instance NFData Variance

instance FromJSON StdDev where
parseJSON (Number v) = case mkStdDev ((fromRational . toRational) v) of
Nothing -> fail "StdDev must not be negative"
Just v' -> pure v'
parseJSON x = typeMismatch "StdDev" x
data StdDev =
NoStdDev
| StdDev {-# UNPACK #-} !Double
deriving (Eq, Show, Generic)

instance NFData StdDev

mkStdDev :: Double -> StdDev
mkStdDev v
| v < 0.0 = NoStdDev
| otherwise = StdDev v

-- | So we can cheaply keep track of long-term change in numeric datasets.
-- Will probably also end up in brandix.
data NumericSummary = NumericSummary !Minimum
!Maximum
{-# UNPACK #-} !Mean
{-# UNPACK #-} !StdDev
!Mean
!StdDev
!Median
deriving (Eq, Show)
deriving (Eq, Show, Generic)

instance NFData NumericSummary

data MeanDevAcc =
MeanDevInitial
| MeanDevAcc {-# UNPACK #-} !MeanAcc !(Maybe StdDevAcc) {-# UNPACK #-} !KAcc
deriving (Eq, Show, Generic)

instance NFData MeanDevAcc

data NumericState =
NumericState {
_stateMinimum :: !Minimum
, _stateMaximum :: !Maximum
, _stateMeanDev :: !MeanDevAcc
} deriving (Eq, Show, Generic)

instance NFData NumericState

makeLenses ''NumericState

initialNumericState :: NumericState
initialNumericState =
NumericState
NoMinimum
NoMaximum
MeanDevInitial
2 changes: 1 addition & 1 deletion src/Warden/Data/Row.hs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ module Warden.Data.Row (
, totalRows
) where

import Control.Lens
import Control.Lens (makeLenses, (^.), (%~))

import Data.ByteString (ByteString)
import Data.Char (chr, ord)
Expand Down
Loading