Skip to content

Commit

Permalink
Merge pull request #99 from ambiata/topic/numbers
Browse files Browse the repository at this point in the history
Numeric fixes + combinators
  • Loading branch information
olorin committed Apr 5, 2016
2 parents b5250db + 2921657 commit 4335367
Show file tree
Hide file tree
Showing 7 changed files with 378 additions and 73 deletions.
12 changes: 12 additions & 0 deletions bench/bench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import Test.QuickCheck (vectorOf, arbitrary)
import Test.Warden.Arbitrary

import Warden.Data
import Warden.Numeric
import Warden.Row
import Warden.View

Expand Down Expand Up @@ -55,6 +56,10 @@ prepareHashText :: IO [ByteString]
prepareHashText =
generate' (Deterministic 54321) (GenSize 30) $ vectorOf 1000 arbitrary

prepareNumbers :: IO [Double]
prepareNumbers =
generate' (Deterministic 2468) (GenSize 100) $ vectorOf 10000 arbitrary

prepareFolds :: IO ([Row], [ByteString])
prepareFolds = (,) <$> prepareSVParse <*> prepareHashText

Expand All @@ -80,6 +85,9 @@ benchHashText = fmap hashText
benchUpdateTextCounts :: [Row] -> TextCounts
benchUpdateTextCounts rs = foldl' (flip (updateTextCounts (TextFreeformThreshold 100))) NoTextCounts rs

benchUpdateNumericState :: [Double] -> NumericState
benchUpdateNumericState ns = foldl' updateNumericState initialNumericState ns

main :: IO ()
main = do
withTempDirectory "." "warden-bench-" $ \root ->
Expand All @@ -98,4 +106,8 @@ main = do
, bench "hashText/1000" $ nf benchHashText ts
, bench "updateTextCounts/1000" $ nf benchUpdateTextCounts rs
]
, env prepareNumbers $ \ ~(ns) ->
bgroup "numerics" $ [
bench "updateNumericState/10000" $ nf benchUpdateNumericState ns
]
]
136 changes: 105 additions & 31 deletions src/Warden/Data/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,41 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE DeriveGeneric #-}

module Warden.Data.Numeric (
Minimum(..)
KAcc(..)
, Maximum(..)
, Mean(..)
, Count(..)
, MeanAcc(..)
, MeanDevAcc(..)
, Median(..)
, StdDev(..)
, Minimum(..)
, NumericState(..)
, NumericSummary(..)
, StdDev(..)
, StdDevAcc(..)
, Variance(..)
, fromVariance
, initialNumericState
, mkStdDev
, stateMaximum
, stateMeanDev
, stateMinimum
) where

import Data.Aeson
import Data.Aeson.Types
import Control.Lens (makeLenses)

import GHC.Generics (Generic)

import P

data Minimum =
Minimum {-# UNPACK #-} !Double
| NoMinimum
deriving (Eq, Show)
deriving (Eq, Show, Generic)

instance NFData Minimum

instance Monoid Minimum where
mempty = NoMinimum
Expand All @@ -41,7 +53,9 @@ instance Monoid Minimum where
data Maximum =
Maximum {-# UNPACK #-} !Double
| NoMaximum
deriving (Eq, Show)
deriving (Eq, Show, Generic)

instance NFData Maximum

instance Monoid Maximum where
mempty = NoMaximum
Expand All @@ -55,42 +69,102 @@ instance Monoid Maximum where
else Maximum prev
{-# INLINE mappend #-}

newtype Count = Count { getCount :: Int }
deriving (Eq, Show, ToJSON, FromJSON)
-- | Counter param for mean/stddev calculation. Equal to one plus the number
-- of records seen.
newtype KAcc =
KAcc {
getKAcc :: Int
} deriving (Eq, Show, Generic, Num)

instance NFData KAcc

newtype Mean = Mean { getMean :: Double }
deriving (Eq, Show, ToJSON, FromJSON)
-- | Preliminary mean, still accumulating.
newtype MeanAcc =
MeanAcc {
unMeanAcc :: Double
} deriving (Eq, Show, Generic)

instance NFData MeanAcc

-- | Final mean.
data Mean =
NoMean
| Mean {-# UNPACK #-} !Double
deriving (Eq, Show, Generic)

instance NFData Mean

data Median =
Median {-# UNPACK #-} !Double
| NoMedian
deriving (Eq, Show)
deriving (Eq, Show, Generic)

newtype Variance = Variance { getVariance :: Double }
deriving (Eq, Show, ToJSON, FromJSON)
instance NFData Median

fromVariance :: Variance -> StdDev
fromVariance = StdDev . sqrt . getVariance
-- | Accumulator for standard deviation calculation. Closer to variance than
-- standard deviation to avoid repeated square roots.
--
-- \( acc = \sigma^{2} (k - 1) \)
--
-- Where `acc` is 'StdDevAcc' and `k` is the 'KAcc'.
newtype StdDevAcc =
StdDevAcc {
unStdDevAcc :: Double
} deriving (Eq, Show, Generic)

newtype StdDev = StdDev { getStdDev :: Double }
deriving (Eq, Show, ToJSON)
instance NFData StdDevAcc

mkStdDev :: Double -> Maybe StdDev
mkStdDev v
| v < 0.0 = Nothing
| otherwise = Just $ StdDev v
newtype Variance =
Variance {
unVariance :: Double
} deriving (Eq, Show, Generic)

instance NFData Variance

instance FromJSON StdDev where
parseJSON (Number v) = case mkStdDev ((fromRational . toRational) v) of
Nothing -> fail "StdDev must not be negative"
Just v' -> pure v'
parseJSON x = typeMismatch "StdDev" x
data StdDev =
NoStdDev
| StdDev {-# UNPACK #-} !Double
deriving (Eq, Show, Generic)

instance NFData StdDev

mkStdDev :: Double -> StdDev
mkStdDev v
| v < 0.0 = NoStdDev
| otherwise = StdDev v

-- | So we can cheaply keep track of long-term change in numeric datasets.
-- Will probably also end up in brandix.
data NumericSummary = NumericSummary !Minimum
!Maximum
{-# UNPACK #-} !Mean
{-# UNPACK #-} !StdDev
!Mean
!StdDev
!Median
deriving (Eq, Show)
deriving (Eq, Show, Generic)

instance NFData NumericSummary

data MeanDevAcc =
MeanDevInitial
| MeanDevAcc {-# UNPACK #-} !MeanAcc !(Maybe StdDevAcc) {-# UNPACK #-} !KAcc
deriving (Eq, Show, Generic)

instance NFData MeanDevAcc

data NumericState =
NumericState {
_stateMinimum :: !Minimum
, _stateMaximum :: !Maximum
, _stateMeanDev :: !MeanDevAcc
} deriving (Eq, Show, Generic)

instance NFData NumericState

makeLenses ''NumericState

initialNumericState :: NumericState
initialNumericState =
NumericState
NoMinimum
NoMaximum
MeanDevInitial
2 changes: 1 addition & 1 deletion src/Warden/Data/Row.hs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ module Warden.Data.Row (
, totalRows
) where

import Control.Lens
import Control.Lens (makeLenses, (^.), (%~))

import Data.ByteString (ByteString)
import Data.Char (chr, ord)
Expand Down
Loading

0 comments on commit 4335367

Please sign in to comment.