Skip to content

Commit

Permalink
Merge pull request #102 from ambiata/topic/numeric-fold
Browse files Browse the repository at this point in the history
Inlude numerics in fold and summary
  • Loading branch information
olorin committed Apr 5, 2016
2 parents 94216ad + 1ed6f0c commit 1af83b3
Show file tree
Hide file tree
Showing 20 changed files with 818 additions and 174 deletions.
22 changes: 7 additions & 15 deletions src/Warden/Data/Marker.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,9 @@ module Warden.Data.Marker (
, FileMarker(..)
, MarkerFailure(..)
, MarkerStatus(..)
, MarkerVersion(..)
, RowCountSummary(..)
, ViewMarker(..)
, ViewMetadata(..)
, currentMarkerVersion
, dateRange
, filePathChar
, fileToMarker
Expand Down Expand Up @@ -42,18 +40,12 @@ import System.IO (FilePath)
import P

import Warden.Data.Check
import Warden.Data.Numeric
import Warden.Data.Param
import Warden.Data.Row
import Warden.Data.TextCounts
import Warden.Data.View

data MarkerVersion =
MarkerV1
deriving (Eq, Show, Ord, Bounded, Enum)

currentMarkerVersion :: MarkerVersion
currentMarkerVersion = maxBound

data CheckResultType =
FileResult
| RowResult
Expand Down Expand Up @@ -108,8 +100,7 @@ summarizeResult typ dsc st =

data FileMarker =
FileMarker {
fmVersion :: !MarkerVersion
, fmWardenParams :: !WardenParams
fmWardenParams :: !WardenParams
, fmViewFile :: !ViewFile
, fmTimestamp :: !DateTime
, fmCheckResults :: ![CheckResultSummary]
Expand All @@ -123,7 +114,7 @@ mkFileMarker :: WardenParams
-> FileMarker
mkFileMarker wps v dsc dt cs =
let crs = [summarizeResult FileResult dsc cs] in
FileMarker currentMarkerVersion wps v dt crs
FileMarker wps v dt crs

markerSuffix :: FilePath
markerSuffix = ".warden"
Expand Down Expand Up @@ -185,8 +176,7 @@ filePathChar = satisfy (not . bad)

data ViewMarker =
ViewMarker {
vmVersion :: !MarkerVersion
, vmWardenParams :: !WardenParams
vmWardenParams :: !WardenParams
, vmView :: !View
, vmTimestamp :: !DateTime
, vmCheckResults :: ![CheckResultSummary]
Expand All @@ -202,7 +192,7 @@ mkViewMarker :: WardenParams
-> ViewMarker
mkViewMarker wps v dsc dt vm cs =
let crs = [summarizeResult RowResult dsc cs] in
ViewMarker currentMarkerVersion wps v dt crs vm
ViewMarker wps v dt crs vm

data RowCountSummary =
RowCountSummary {
Expand All @@ -211,6 +201,7 @@ data RowCountSummary =
, rcsNumFields :: !(Set FieldCount)
, rcsFieldLooks :: !FieldLookCount
, rcsTextCounts :: !TextCounts
, rcsNumericSummaries :: !NumericFieldSummary
} deriving (Eq, Show)

summarizeSVParseState :: SVParseState -> RowCountSummary
Expand All @@ -221,6 +212,7 @@ summarizeSVParseState ps =
(ps ^. numFields)
(ps ^. fieldLooks)
(ps ^. textCounts)
(summarizeFieldNumericState $ ps ^. numericState)

data ViewMetadata =
ViewMetadata {
Expand Down
90 changes: 83 additions & 7 deletions src/Warden/Data/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,38 @@
{-# LANGUAGE DeriveGeneric #-}

module Warden.Data.Numeric (
KAcc(..)
FieldNumericState(..)
, KAcc(..)
, Maximum(..)
, Mean(..)
, MeanAcc(..)
, MeanDevAcc(..)
, Median(..)
, Minimum(..)
, NumericField(..)
, NumericState(..)
, NumericFieldSummary(..)
, NumericSummary(..)
, StdDev(..)
, StdDevAcc(..)
, Variance(..)
, finalizeMeanDev
, finalizeStdDevAcc
, initialNumericState
, mkStdDev
, stdDevAccFromVariance
, stateMaximum
, stateMeanDev
, stateMinimum
, summarizeFieldNumericState
, summarizeNumericState
, varianceFromStdDevAcc
) where

import Control.Lens (makeLenses)
import Control.Lens (makeLenses, (^.))

import Data.AEq (AEq, (===), (~==))
import qualified Data.Vector as V

import GHC.Generics (Generic)

Expand Down Expand Up @@ -177,15 +187,27 @@ mkStdDev v

-- | So we can cheaply keep track of long-term change in numeric datasets.
-- Will probably also end up in brandix.
data NumericSummary = NumericSummary !Minimum
!Maximum
!Mean
!StdDev
!Median
data NumericSummary =
NoNumericSummary
| NumericSummary !Minimum !Maximum !Mean !StdDev !Median
deriving (Eq, Show, Generic)

instance NFData NumericSummary

data FieldNumericState =
FieldNumericState !(V.Vector NumericState)
| NoFieldNumericState
deriving (Eq, Show, Generic)

instance NFData FieldNumericState

data NumericFieldSummary =
NumericFieldSummary !(V.Vector NumericSummary)
| NoNumericFieldSummary
deriving (Eq, Show, Generic)

instance NFData NumericFieldSummary

data MeanDevAcc =
MeanDevInitial
| MeanDevAcc {-# UNPACK #-} !MeanAcc !(Maybe StdDevAcc) {-# UNPACK #-} !KAcc
Expand Down Expand Up @@ -231,3 +253,57 @@ initialNumericState =
NoMinimum
NoMaximum
MeanDevInitial

-- | For numeric purposes we treat all numbers as reals.
newtype NumericField =
NumericField {
unNumericField :: Double
} deriving (Eq, Show)

summarizeNumericState :: NumericState -> NumericSummary
summarizeNumericState st =
if st == initialNumericState
-- We didn't see any numeric fields, so there's nothing to summarize.
then NoNumericSummary
else let (mn, stddev) = finalizeMeanDev $ st ^. stateMeanDev in
NumericSummary
(st ^. stateMinimum)
(st ^. stateMaximum)
mn
stddev
NoMedian

summarizeFieldNumericState :: FieldNumericState -> NumericFieldSummary
summarizeFieldNumericState NoFieldNumericState =
NoNumericFieldSummary
summarizeFieldNumericState (FieldNumericState ss) =
if V.null ss
then
NoNumericFieldSummary
else
NumericFieldSummary . V.map summarizeNumericState $ ss

varianceFromStdDevAcc :: KAcc -> StdDevAcc -> Variance
varianceFromStdDevAcc (KAcc n) (StdDevAcc sda) =
Variance $ sda / fromIntegral (n - 1)
{-# INLINE varianceFromStdDevAcc #-}

stdDevAccFromVariance :: KAcc -> Variance -> StdDevAcc
stdDevAccFromVariance (KAcc n) (Variance var) =
StdDevAcc $ var * fromIntegral (n - 1)
{-# INLINE stdDevAccFromVariance #-}

stdDevFromVariance :: Variance -> StdDev
stdDevFromVariance = StdDev . sqrt . unVariance
{-# INLINE stdDevFromVariance #-}

finalizeStdDevAcc :: KAcc -> StdDevAcc -> StdDev
finalizeStdDevAcc ka sda =
stdDevFromVariance $ varianceFromStdDevAcc ka sda
{-# INLINE finalizeStdDevAcc #-}

finalizeMeanDev :: MeanDevAcc -> (Mean, StdDev)
finalizeMeanDev MeanDevInitial = (NoMean, NoStdDev)
finalizeMeanDev (MeanDevAcc _ Nothing _) = (NoMean, NoStdDev)
finalizeMeanDev (MeanDevAcc mn (Just sda) n) = (Mean (unMeanAcc mn), finalizeStdDevAcc n sda)
{-# INLINE finalizeMeanDev #-}
32 changes: 10 additions & 22 deletions src/Warden/Data/Row.hs
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,18 @@ module Warden.Data.Row (
, emptyLookCountVector
, fieldLooks
, initialSVParseState
, numericState
, numFields
, renderFieldCount
, renderObservationCount
, renderParsedField
, renderRowCount
, resolveSVParseState
, separatorToChar
, textCounts
, totalRows
) where

import Control.Lens (makeLenses, (^.), (%~))
import Control.Lens (makeLenses)

import Data.ByteString (ByteString)
import Data.Char (chr, ord)
Expand All @@ -57,6 +57,7 @@ import P
import Prelude (fromEnum)

import Warden.Data.Field
import Warden.Data.Numeric
import Warden.Data.TextCounts

newtype RawRecord =
Expand Down Expand Up @@ -161,31 +162,18 @@ instance NFData FieldLookCount

data SVParseState =
SVParseState {
_badRows :: {-# UNPACK #-} !RowCount
, _totalRows :: {-# UNPACK #-} !RowCount
, _numFields :: !(Set FieldCount)
, _fieldLooks :: !FieldLookCount
, _textCounts :: !TextCounts
_badRows :: {-# UNPACK #-} !RowCount
, _totalRows :: {-# UNPACK #-} !RowCount
, _numFields :: !(Set FieldCount)
, _fieldLooks :: !FieldLookCount
, _textCounts :: !TextCounts
, _numericState :: !FieldNumericState
} deriving (Eq, Show, Generic)

instance NFData SVParseState

makeLenses ''SVParseState

resolveSVParseState :: TextFreeformThreshold -> [SVParseState] -> SVParseState
resolveSVParseState fft = foldr update initialSVParseState
where
update s !acc =
(badRows %~ ((s ^. badRows) +))
. (totalRows %~ ((s ^. totalRows) +))
. (numFields %~ ((s ^. numFields) `S.union`))
. (fieldLooks %~ ((s ^. fieldLooks) `combineFieldLooks`))
. (textCounts %~ ((s ^. textCounts) `combineTextCounts'`))
$! acc

combineTextCounts' = combineTextCounts fft
{-# INLINE resolveSVParseState #-}

-- | We don't include a ParsedText here; Text is indicated by failure of
-- the parser.
data ParsedField =
Expand All @@ -202,4 +190,4 @@ renderParsedField = T.pack . show

initialSVParseState :: SVParseState
initialSVParseState =
SVParseState 0 0 S.empty NoFieldLookCount NoTextCounts
SVParseState 0 0 S.empty NoFieldLookCount NoTextCounts NoFieldNumericState
2 changes: 1 addition & 1 deletion src/Warden/Error.hs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ data TraversalError =
renderTraversalError :: TraversalError -> Text
renderTraversalError = ("traversal error: " <>) . render'
where
render' MaxDepthExceeded = "maximum traversal depth exceeded"
render' MaxDepthExceeded = "maximum traversal depth exceeded - make sure you're pointing to the top level of a view."
render' EmptyView = "no files found in view"
render' (NonViewFiles fs) =
"extra files which don't seem to be part of a view: "
Expand Down
3 changes: 0 additions & 3 deletions src/Warden/Inference.hs
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,10 @@ import Warden.Error
-- | Do these two markers look like they're compatible?
viewMarkerMismatch :: ViewMarker -> ViewMarker -> Either ValidationFailure ()
viewMarkerMismatch a b = do
validateVersion (vmVersion a) (vmVersion b)
validateView (vmView a) (vmView b)
validateTotalFields (fields' a) (fields' b)
validateFreeformThreshold (fft' a) (fft' b)
where
validateVersion = validateEq "vmVersion"

validateView = validateEq "vmView"

validateTotalFields = validateEq "numFields"
Expand Down
1 change: 1 addition & 0 deletions src/Warden/Marker.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
module Warden.Marker(
fileMarkerExists
, readFileMarker
, readFileMarker'
, readViewMarker
, viewMarkerExists
, writeFileMarker
Expand Down
Loading

0 comments on commit 1af83b3

Please sign in to comment.