Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor count summary for flexibility + more tests #100

Merged
merged 8 commits into from
Apr 5, 2016
1 change: 1 addition & 0 deletions ambiata-warden.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ library
, either == 4.3.*
, filepath == 1.3.*
, foldl == 1.1.*
, ieee754 == 0.7.*
, lens == 4.9.*
, lifted-async == 0.5.*
, resourcet == 1.1.*
Expand Down
18 changes: 17 additions & 1 deletion bench/bench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ prepareNumbers =
prepareFolds :: IO ([Row], [ByteString])
prepareFolds = (,) <$> prepareSVParse <*> prepareHashText

prepareMeanDevAccs :: IO [MeanDevAcc]
prepareMeanDevAccs =
generate' (Deterministic 8642) (GenSize 100) $ vectorOf 10000 arbitrary

prepareNumericStates :: IO [NumericState]
prepareNumericStates =
generate' (Deterministic 9876) (GenSize 100) $ vectorOf 10000 arbitrary

benchABDecode :: NonEmpty ViewFile -> IO ()
benchABDecode vfs =
let sep = Separator . fromIntegral $ ord '|'
Expand All @@ -88,6 +96,12 @@ benchUpdateTextCounts rs = foldl' (flip (updateTextCounts (TextFreeformThreshold
benchUpdateNumericState :: [Double] -> NumericState
benchUpdateNumericState ns = foldl' updateNumericState initialNumericState ns

benchCombineMeanDevAcc :: [MeanDevAcc] -> MeanDevAcc
benchCombineMeanDevAcc mdas = foldl' combineMeanDevAcc MeanDevInitial mdas

benchCombineNumericState :: [NumericState] -> NumericState
benchCombineNumericState nss = foldl' combineNumericState initialNumericState nss

main :: IO ()
main = do
withTempDirectory "." "warden-bench-" $ \root ->
Expand All @@ -106,8 +120,10 @@ main = do
, bench "hashText/1000" $ nf benchHashText ts
, bench "updateTextCounts/1000" $ nf benchUpdateTextCounts rs
]
, env prepareNumbers $ \ ~(ns) ->
, env ((,,) <$> prepareNumbers <*> prepareMeanDevAccs <*> prepareNumericStates) $ \ ~(ns, mdas, nss) ->
bgroup "numerics" $ [
bench "updateNumericState/10000" $ nf benchUpdateNumericState ns
, bench "combineMeanDevAcc/10000" $ nf benchCombineMeanDevAcc mdas
, bench "combineNumericState/10000" $ nf benchCombineNumericState nss
]
]
5 changes: 3 additions & 2 deletions src/Warden/Check/Row.hs
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,9 @@ finalizeSVParseState ps sch ds vfs sv =
, checkTotalRows (sv ^. totalRows)
, checkBadRows (sv ^. badRows)
]
vfs' = S.fromList $ NE.toList vfs in
(st, ViewMetadata sv ps ds vfs')
vfs' = S.fromList $ NE.toList vfs
rcs = summarizeSVParseState sv in
(st, ViewMetadata rcs ps ds vfs')

checkTotalRows :: RowCount -> CheckStatus
checkTotalRows (RowCount n)
Expand Down
25 changes: 24 additions & 1 deletion src/Warden/Data/Marker.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ module Warden.Data.Marker (
, MarkerFailure(..)
, MarkerStatus(..)
, MarkerVersion(..)
, RowCountSummary(..)
, ViewMarker(..)
, ViewMetadata(..)
, currentMarkerVersion
Expand All @@ -18,9 +19,12 @@ module Warden.Data.Marker (
, markerToFile
, mkFileMarker
, mkViewMarker
, summarizeSVParseState
, viewMarkerPath
) where

import Control.Lens ((^.))

import Data.Attoparsec.Text (IResult(..), Parser, parse)
import Data.Attoparsec.Text (string, satisfy, manyTill')
import Data.Char (ord)
Expand All @@ -40,6 +44,7 @@ import P
import Warden.Data.Check
import Warden.Data.Param
import Warden.Data.Row
import Warden.Data.TextCounts
import Warden.Data.View

data MarkerVersion =
Expand Down Expand Up @@ -199,9 +204,27 @@ mkViewMarker wps v dsc dt vm cs =
let crs = [summarizeResult RowResult dsc cs] in
ViewMarker currentMarkerVersion wps v dt crs vm

data RowCountSummary =
RowCountSummary {
rcsBadRows :: !RowCount
, rcsTotalRows :: !RowCount
, rcsNumFields :: !(Set FieldCount)
, rcsFieldLooks :: !FieldLookCount
, rcsTextCounts :: !TextCounts
} deriving (Eq, Show)

summarizeSVParseState :: SVParseState -> RowCountSummary
summarizeSVParseState ps =
RowCountSummary
(ps ^. badRows)
(ps ^. totalRows)
(ps ^. numFields)
(ps ^. fieldLooks)
(ps ^. textCounts)

data ViewMetadata =
ViewMetadata {
vmViewCounts :: !SVParseState
vmViewCounts :: !RowCountSummary
, vmCheckParams :: !CheckParams
, vmDates :: !(Set Date)
, vmViewFiles :: !(Set ViewFile)
Expand Down
63 changes: 63 additions & 0 deletions src/Warden/Data/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ module Warden.Data.Numeric (

import Control.Lens (makeLenses)

import Data.AEq (AEq, (===), (~==))

import GHC.Generics (Generic)

import P
Expand All @@ -38,6 +40,10 @@ data Minimum =

instance NFData Minimum

instance AEq Minimum where
(===) = (==)
(~==) = (==)

instance Monoid Minimum where
mempty = NoMinimum
mappend x y = mcompare x y
Expand All @@ -57,6 +63,10 @@ data Maximum =

instance NFData Maximum

instance AEq Maximum where
(===) = (==)
(~==) = (==)

instance Monoid Maximum where
mempty = NoMaximum
mappend x y = mcompare x y
Expand Down Expand Up @@ -86,6 +96,11 @@ newtype MeanAcc =

instance NFData MeanAcc

instance AEq MeanAcc where
(===) = (==)

(MeanAcc x) ~== (MeanAcc y) = x ~== y

-- | Final mean.
data Mean =
NoMean
Expand All @@ -94,6 +109,17 @@ data Mean =

instance NFData Mean

instance AEq Mean where
NoMean === NoMean = True
NoMean === _ = False
_ === NoMean = False
(Mean x) === (Mean y) = x === y

NoMean ~== NoMean = True
NoMean ~== _ = False
_ ~== NoMean = False
(Mean x) ~== (Mean y) = x ~== y

data Median =
Median {-# UNPACK #-} !Double
| NoMedian
Expand All @@ -114,6 +140,11 @@ newtype StdDevAcc =

instance NFData StdDevAcc

instance AEq StdDevAcc where
(===) = (==)

(StdDevAcc x) ~== (StdDevAcc y) = x ~== y

newtype Variance =
Variance {
unVariance :: Double
Expand All @@ -128,6 +159,17 @@ data StdDev =

instance NFData StdDev

instance AEq StdDev where
NoStdDev === NoStdDev = True
NoStdDev === _ = False
_ === NoStdDev = False
(StdDev x) === (StdDev y) = x === y

NoStdDev ~== NoStdDev = True
NoStdDev ~== _ = False
_ ~== NoStdDev = False
(StdDev x) ~== (StdDev y) = x ~== y

mkStdDev :: Double -> StdDev
mkStdDev v
| v < 0.0 = NoStdDev
Expand All @@ -151,6 +193,18 @@ data MeanDevAcc =

instance NFData MeanDevAcc

instance AEq MeanDevAcc where
(===) = (==)

MeanDevInitial ~== MeanDevInitial = True
MeanDevInitial ~== _ = False
_ ~== MeanDevInitial = False
(MeanDevAcc mu1 s21 n1) ~== (MeanDevAcc mu2 s22 n2) = and [
mu1 ~== mu2
, s21 ~== s22
, n1 == n2
]

data NumericState =
NumericState {
_stateMinimum :: !Minimum
Expand All @@ -162,6 +216,15 @@ instance NFData NumericState

makeLenses ''NumericState

instance AEq NumericState where
(===) = (==)

(NumericState mn1 mx1 mda1) ~== (NumericState mn2 mx2 mda2) = and [
mn1 ~== mn2
, mx1 ~== mx2
, mda1 ~== mda2
]

initialNumericState :: NumericState
initialNumericState =
NumericState
Expand Down
10 changes: 4 additions & 6 deletions src/Warden/Inference.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ module Warden.Inference (
, viewMarkerMismatch
) where

import Control.Lens ((^.), view)

import Data.List (zip)
import Data.List.NonEmpty (NonEmpty(..), nonEmpty)
import qualified Data.List.NonEmpty as NE
Expand Down Expand Up @@ -58,7 +56,7 @@ viewMarkerMismatch a b = do
then pure ()
else Left $ ViewMarkerMismatch ctx (T.pack $ show x) (T.pack $ show y)

fields' vm' = (vmViewCounts $ vmMetadata vm') ^. numFields
fields' = rcsNumFields . vmViewCounts . vmMetadata

fft' = checkFreeformThreshold . vmCheckParams . vmMetadata

Expand All @@ -82,14 +80,14 @@ validateViewMarkers (m:|ms) = go m ms >> checkFails
fieldLookSum :: ValidViewMarkers -> FieldLookCount
fieldLookSum =
foldl' combineFieldLooks NoFieldLookCount .
fmap (view fieldLooks . vmViewCounts . vmMetadata) . unValidViewMarkers
fmap (rcsFieldLooks . vmViewCounts . vmMetadata) . unValidViewMarkers

textCountSum :: ValidViewMarkers -> TextCounts
textCountSum (ValidViewMarkers vms) =
-- FFTs already validated as the same
let fft = checkFreeformThreshold . vmCheckParams . vmMetadata $ NE.head vms in
foldl' (combineTextCounts fft) NoTextCounts $
fmap (view textCounts . vmViewCounts . vmMetadata) vms
fmap (rcsTextCounts . vmViewCounts . vmMetadata) vms

inferForms :: ValidViewMarkers -> Either InferenceError TextCountSummary
inferForms vms =
Expand Down Expand Up @@ -135,7 +133,7 @@ countCompatibleFields vms =
Right $ V.map countsForField fls

totalViewRows :: ValidViewMarkers -> RowCount
totalViewRows = sum . fmap (view totalRows . vmViewCounts . vmMetadata) . unValidViewMarkers
totalViewRows = sum . fmap (rcsTotalRows . vmViewCounts . vmMetadata) . unValidViewMarkers

normalizeFieldHistogram :: RowCount -> FieldHistogram -> Either InferenceError (VU.Vector NormalizedEntries)
normalizeFieldHistogram (RowCount rc) (FieldHistogram cs) = do
Expand Down
12 changes: 11 additions & 1 deletion src/Warden/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
module Warden.Numeric (
combineMeanAcc
, combineMeanDevAcc
, combineNumericState
, combineStdDevAcc
, finalizeMeanDev
, finalizeStdDevAcc
Expand Down Expand Up @@ -36,7 +37,7 @@ updateMaximum !acc x =
in acc <> x'
{-# INLINE updateMaximum #-}

-- Minimal-error mean and variance.
-- | Minimal-error mean and standard deviation.
--
-- From Knuth (TAoCP v2, Seminumerical Algorithms, p232).
--
Expand Down Expand Up @@ -169,3 +170,12 @@ stdDevAccFromVariance (KAcc n) (Variance var) =
stdDevFromVariance :: Variance -> StdDev
stdDevFromVariance = StdDev . sqrt . unVariance
{-# INLINE stdDevFromVariance #-}

-- FIXME: not associative
combineNumericState :: NumericState -> NumericState -> NumericState
combineNumericState ns1 ns2 =
(stateMinimum %~ (<> (ns1 ^. stateMinimum)))
. (stateMaximum %~ (<> (ns1 ^. stateMaximum)))
. (stateMeanDev %~ (combineMeanDevAcc (ns1 ^. stateMeanDev)))
$!! ns2
{-# INLINE combineNumericState #-}
5 changes: 5 additions & 0 deletions src/Warden/Sampling/Reservoir.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ module Warden.Sampling.Reservoir (
, xQuantile
) where

import Data.AEq (AEq, (===), (~==))
import Data.Vector.Unboxed (Vector)

import P
Expand All @@ -47,6 +48,10 @@ newtype Probability =
unProbability :: Double
} deriving (Eq, Show)

instance AEq Probability where
(Probability p) === (Probability q) = p === q
(Probability p) ~== (Probability q) = p ~== q

probability :: Double -> Maybe Probability
probability p
| p < 0 || p > 1 = Nothing
Expand Down
Loading