Skip to content

Commit

Permalink
Merge pull request #100 from ambiata/topic/count-summary
Browse files Browse the repository at this point in the history
Refactor count summary for flexibility + more tests
  • Loading branch information
olorin committed Apr 5, 2016
2 parents 0a5db38 + f106094 commit 94216ad
Show file tree
Hide file tree
Showing 15 changed files with 238 additions and 119 deletions.
1 change: 1 addition & 0 deletions ambiata-warden.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ library
, either == 4.3.*
, filepath == 1.3.*
, foldl == 1.1.*
, ieee754 == 0.7.*
, lens == 4.9.*
, lifted-async == 0.5.*
, resourcet == 1.1.*
Expand Down
18 changes: 17 additions & 1 deletion bench/bench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ prepareNumbers =
prepareFolds :: IO ([Row], [ByteString])
prepareFolds = (,) <$> prepareSVParse <*> prepareHashText

prepareMeanDevAccs :: IO [MeanDevAcc]
prepareMeanDevAccs =
generate' (Deterministic 8642) (GenSize 100) $ vectorOf 10000 arbitrary

prepareNumericStates :: IO [NumericState]
prepareNumericStates =
generate' (Deterministic 9876) (GenSize 100) $ vectorOf 10000 arbitrary

benchABDecode :: NonEmpty ViewFile -> IO ()
benchABDecode vfs =
let sep = Separator . fromIntegral $ ord '|'
Expand All @@ -88,6 +96,12 @@ benchUpdateTextCounts rs = foldl' (flip (updateTextCounts (TextFreeformThreshold
benchUpdateNumericState :: [Double] -> NumericState
benchUpdateNumericState ns = foldl' updateNumericState initialNumericState ns

benchCombineMeanDevAcc :: [MeanDevAcc] -> MeanDevAcc
benchCombineMeanDevAcc mdas = foldl' combineMeanDevAcc MeanDevInitial mdas

benchCombineNumericState :: [NumericState] -> NumericState
benchCombineNumericState nss = foldl' combineNumericState initialNumericState nss

main :: IO ()
main = do
withTempDirectory "." "warden-bench-" $ \root ->
Expand All @@ -106,8 +120,10 @@ main = do
, bench "hashText/1000" $ nf benchHashText ts
, bench "updateTextCounts/1000" $ nf benchUpdateTextCounts rs
]
, env prepareNumbers $ \ ~(ns) ->
, env ((,,) <$> prepareNumbers <*> prepareMeanDevAccs <*> prepareNumericStates) $ \ ~(ns, mdas, nss) ->
bgroup "numerics" $ [
bench "updateNumericState/10000" $ nf benchUpdateNumericState ns
, bench "combineMeanDevAcc/10000" $ nf benchCombineMeanDevAcc mdas
, bench "combineNumericState/10000" $ nf benchCombineNumericState nss
]
]
5 changes: 3 additions & 2 deletions src/Warden/Check/Row.hs
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,9 @@ finalizeSVParseState ps sch ds vfs sv =
, checkTotalRows (sv ^. totalRows)
, checkBadRows (sv ^. badRows)
]
vfs' = S.fromList $ NE.toList vfs in
(st, ViewMetadata sv ps ds vfs')
vfs' = S.fromList $ NE.toList vfs
rcs = summarizeSVParseState sv in
(st, ViewMetadata rcs ps ds vfs')

checkTotalRows :: RowCount -> CheckStatus
checkTotalRows (RowCount n)
Expand Down
25 changes: 24 additions & 1 deletion src/Warden/Data/Marker.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ module Warden.Data.Marker (
, MarkerFailure(..)
, MarkerStatus(..)
, MarkerVersion(..)
, RowCountSummary(..)
, ViewMarker(..)
, ViewMetadata(..)
, currentMarkerVersion
Expand All @@ -18,9 +19,12 @@ module Warden.Data.Marker (
, markerToFile
, mkFileMarker
, mkViewMarker
, summarizeSVParseState
, viewMarkerPath
) where

import Control.Lens ((^.))

import Data.Attoparsec.Text (IResult(..), Parser, parse)
import Data.Attoparsec.Text (string, satisfy, manyTill')
import Data.Char (ord)
Expand All @@ -40,6 +44,7 @@ import P
import Warden.Data.Check
import Warden.Data.Param
import Warden.Data.Row
import Warden.Data.TextCounts
import Warden.Data.View

data MarkerVersion =
Expand Down Expand Up @@ -199,9 +204,27 @@ mkViewMarker wps v dsc dt vm cs =
let crs = [summarizeResult RowResult dsc cs] in
ViewMarker currentMarkerVersion wps v dt crs vm

data RowCountSummary =
RowCountSummary {
rcsBadRows :: !RowCount
, rcsTotalRows :: !RowCount
, rcsNumFields :: !(Set FieldCount)
, rcsFieldLooks :: !FieldLookCount
, rcsTextCounts :: !TextCounts
} deriving (Eq, Show)

summarizeSVParseState :: SVParseState -> RowCountSummary
summarizeSVParseState ps =
RowCountSummary
(ps ^. badRows)
(ps ^. totalRows)
(ps ^. numFields)
(ps ^. fieldLooks)
(ps ^. textCounts)

data ViewMetadata =
ViewMetadata {
vmViewCounts :: !SVParseState
vmViewCounts :: !RowCountSummary
, vmCheckParams :: !CheckParams
, vmDates :: !(Set Date)
, vmViewFiles :: !(Set ViewFile)
Expand Down
63 changes: 63 additions & 0 deletions src/Warden/Data/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ module Warden.Data.Numeric (

import Control.Lens (makeLenses)

import Data.AEq (AEq, (===), (~==))

import GHC.Generics (Generic)

import P
Expand All @@ -38,6 +40,10 @@ data Minimum =

instance NFData Minimum

instance AEq Minimum where
(===) = (==)
(~==) = (==)

instance Monoid Minimum where
mempty = NoMinimum
mappend x y = mcompare x y
Expand All @@ -57,6 +63,10 @@ data Maximum =

instance NFData Maximum

instance AEq Maximum where
(===) = (==)
(~==) = (==)

instance Monoid Maximum where
mempty = NoMaximum
mappend x y = mcompare x y
Expand Down Expand Up @@ -86,6 +96,11 @@ newtype MeanAcc =

instance NFData MeanAcc

instance AEq MeanAcc where
(===) = (==)

(MeanAcc x) ~== (MeanAcc y) = x ~== y

-- | Final mean.
data Mean =
NoMean
Expand All @@ -94,6 +109,17 @@ data Mean =

instance NFData Mean

instance AEq Mean where
NoMean === NoMean = True
NoMean === _ = False
_ === NoMean = False
(Mean x) === (Mean y) = x === y

NoMean ~== NoMean = True
NoMean ~== _ = False
_ ~== NoMean = False
(Mean x) ~== (Mean y) = x ~== y

data Median =
Median {-# UNPACK #-} !Double
| NoMedian
Expand All @@ -114,6 +140,11 @@ newtype StdDevAcc =

instance NFData StdDevAcc

instance AEq StdDevAcc where
(===) = (==)

(StdDevAcc x) ~== (StdDevAcc y) = x ~== y

newtype Variance =
Variance {
unVariance :: Double
Expand All @@ -128,6 +159,17 @@ data StdDev =

instance NFData StdDev

instance AEq StdDev where
NoStdDev === NoStdDev = True
NoStdDev === _ = False
_ === NoStdDev = False
(StdDev x) === (StdDev y) = x === y

NoStdDev ~== NoStdDev = True
NoStdDev ~== _ = False
_ ~== NoStdDev = False
(StdDev x) ~== (StdDev y) = x ~== y

mkStdDev :: Double -> StdDev
mkStdDev v
| v < 0.0 = NoStdDev
Expand All @@ -151,6 +193,18 @@ data MeanDevAcc =

instance NFData MeanDevAcc

instance AEq MeanDevAcc where
(===) = (==)

MeanDevInitial ~== MeanDevInitial = True
MeanDevInitial ~== _ = False
_ ~== MeanDevInitial = False
(MeanDevAcc mu1 s21 n1) ~== (MeanDevAcc mu2 s22 n2) = and [
mu1 ~== mu2
, s21 ~== s22
, n1 == n2
]

data NumericState =
NumericState {
_stateMinimum :: !Minimum
Expand All @@ -162,6 +216,15 @@ instance NFData NumericState

makeLenses ''NumericState

instance AEq NumericState where
(===) = (==)

(NumericState mn1 mx1 mda1) ~== (NumericState mn2 mx2 mda2) = and [
mn1 ~== mn2
, mx1 ~== mx2
, mda1 ~== mda2
]

initialNumericState :: NumericState
initialNumericState =
NumericState
Expand Down
10 changes: 4 additions & 6 deletions src/Warden/Inference.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ module Warden.Inference (
, viewMarkerMismatch
) where

import Control.Lens ((^.), view)

import Data.List (zip)
import Data.List.NonEmpty (NonEmpty(..), nonEmpty)
import qualified Data.List.NonEmpty as NE
Expand Down Expand Up @@ -58,7 +56,7 @@ viewMarkerMismatch a b = do
then pure ()
else Left $ ViewMarkerMismatch ctx (T.pack $ show x) (T.pack $ show y)

fields' vm' = (vmViewCounts $ vmMetadata vm') ^. numFields
fields' = rcsNumFields . vmViewCounts . vmMetadata

fft' = checkFreeformThreshold . vmCheckParams . vmMetadata

Expand All @@ -82,14 +80,14 @@ validateViewMarkers (m:|ms) = go m ms >> checkFails
fieldLookSum :: ValidViewMarkers -> FieldLookCount
fieldLookSum =
foldl' combineFieldLooks NoFieldLookCount .
fmap (view fieldLooks . vmViewCounts . vmMetadata) . unValidViewMarkers
fmap (rcsFieldLooks . vmViewCounts . vmMetadata) . unValidViewMarkers

textCountSum :: ValidViewMarkers -> TextCounts
textCountSum (ValidViewMarkers vms) =
-- FFTs already validated as the same
let fft = checkFreeformThreshold . vmCheckParams . vmMetadata $ NE.head vms in
foldl' (combineTextCounts fft) NoTextCounts $
fmap (view textCounts . vmViewCounts . vmMetadata) vms
fmap (rcsTextCounts . vmViewCounts . vmMetadata) vms

inferForms :: ValidViewMarkers -> Either InferenceError TextCountSummary
inferForms vms =
Expand Down Expand Up @@ -135,7 +133,7 @@ countCompatibleFields vms =
Right $ V.map countsForField fls

totalViewRows :: ValidViewMarkers -> RowCount
totalViewRows = sum . fmap (view totalRows . vmViewCounts . vmMetadata) . unValidViewMarkers
totalViewRows = sum . fmap (rcsTotalRows . vmViewCounts . vmMetadata) . unValidViewMarkers

normalizeFieldHistogram :: RowCount -> FieldHistogram -> Either InferenceError (VU.Vector NormalizedEntries)
normalizeFieldHistogram (RowCount rc) (FieldHistogram cs) = do
Expand Down
12 changes: 11 additions & 1 deletion src/Warden/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
module Warden.Numeric (
combineMeanAcc
, combineMeanDevAcc
, combineNumericState
, combineStdDevAcc
, finalizeMeanDev
, finalizeStdDevAcc
Expand Down Expand Up @@ -36,7 +37,7 @@ updateMaximum !acc x =
in acc <> x'
{-# INLINE updateMaximum #-}

-- Minimal-error mean and variance.
-- | Minimal-error mean and standard deviation.
--
-- From Knuth (TAoCP v2, Seminumerical Algorithms, p232).
--
Expand Down Expand Up @@ -169,3 +170,12 @@ stdDevAccFromVariance (KAcc n) (Variance var) =
stdDevFromVariance :: Variance -> StdDev
stdDevFromVariance = StdDev . sqrt . unVariance
{-# INLINE stdDevFromVariance #-}

-- FIXME: not associative
combineNumericState :: NumericState -> NumericState -> NumericState
combineNumericState ns1 ns2 =
(stateMinimum %~ (<> (ns1 ^. stateMinimum)))
. (stateMaximum %~ (<> (ns1 ^. stateMaximum)))
. (stateMeanDev %~ (combineMeanDevAcc (ns1 ^. stateMeanDev)))
$!! ns2
{-# INLINE combineNumericState #-}
5 changes: 5 additions & 0 deletions src/Warden/Sampling/Reservoir.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ module Warden.Sampling.Reservoir (
, xQuantile
) where

import Data.AEq (AEq, (===), (~==))
import Data.Vector.Unboxed (Vector)

import P
Expand All @@ -47,6 +48,10 @@ newtype Probability =
unProbability :: Double
} deriving (Eq, Show)

instance AEq Probability where
(Probability p) === (Probability q) = p === q
(Probability p) ~== (Probability q) = p ~== q

probability :: Double -> Maybe Probability
probability p
| p < 0 || p > 1 = Nothing
Expand Down
Loading

0 comments on commit 94216ad

Please sign in to comment.