Skip to content

Commit

Permalink
DE-Numeral-complex-German-numerals (#699)
Browse files Browse the repository at this point in the history
Summary:
The commit adds a rule and an underlying parser for German numeral expressions representing (integer) numbers smaller than 1 million. Other than in English, those numbers are represented by single words, e.g. "neunhundertsiebenundachtzigtausendsechshundertvierundfünfzig" (987654). Other rukes are simplified or removed to eliminate redundancies.

Pull Request resolved: #699

Reviewed By: patapizza

Differential Revision: D37716120

Pulled By: stroxler

fbshipit-source-id: 90b26e253259c5bc1aaa76f3972537c2361f6bb3
  • Loading branch information
andhai authored and facebook-github-bot committed Jul 14, 2022
1 parent 1faab00 commit 9509e04
Show file tree
Hide file tree
Showing 6 changed files with 283 additions and 183 deletions.
32 changes: 21 additions & 11 deletions Duckling/Numeral/DE/Corpus.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ allExamples = concat
, examples (NumeralValue 1)
[ "1"
, "eins"
, "Eine"
, "einen"
]
, examples (NumeralValue 3)
[ "3"
Expand All @@ -46,11 +48,12 @@ allExamples = concat
]
, examples (NumeralValue 30)
[ "30"
, "dreißig"
, "dreissig"
]
, examples (NumeralValue 33)
[ "33"
, "drei Und dreissig"
, "dreiunddreißig"
, "dreiunddreissig"
, "0033"
]
Expand All @@ -76,11 +79,11 @@ allExamples = concat
]
, examples (NumeralValue 200)
[ "200"
, "zwei hundert"
, "zweihundert"
]
, examples (NumeralValue 102)
[ "102"
, "Hundert zwei"
, "Hundertzwei"
]
, examples (NumeralValue 1.1)
[ "1,1"
Expand All @@ -97,12 +100,15 @@ allExamples = concat
, "100000"
, "100K"
, "100k"
, "einhunderttausend"
, "hunderttausend"
]
, examples (NumeralValue 3000000)
[ "3M"
, "3000K"
, "3000000"
, "3.000.000"
, "drei Millionen"
]
, examples (NumeralValue 1200000)
[ "1.200.000"
Expand All @@ -120,30 +126,34 @@ allExamples = concat
, "-1200K"
, "-,0012G"
]
, examples (NumeralValue 1852)
[ "eintausendachthundertzweiundfünfzig"
, "tausendachthundertzweiundfünfzig"
, "achtzehnhundertzweiundfünfzig"]
, examples (NumeralValue 5000)
[ "5 tausend"
, "fünf tausend"
, "fünftausend"
]
, examples (NumeralValue 200000)
[ "zwei hundert tausend"
[ "zweihunderttausend"
]
, examples (NumeralValue 721012)
[ "sieben hundert einundzwanzig tausend zwölf"
[ "siebenhunderteinundzwanzigtausendzwölf"
, "siebenhunderteinundzwanzigtausendundzwölf"
]
, examples (NumeralValue 31256721)
[ "ein und dreissig millionen zwei hundert sechs und fünfzig tausend sieben hundert ein und zwanzig"
[ "einunddreissig millionen zweihundertsechsundfünfzigtausendsiebenhunderteinundzwanzig"
, "einunddreißig Millionen zweihundertsechsundfünfzigtausendundsiebenhunderteinundzwanzig"
]
, examples (NumeralValue 1416.15)
[ "1416,15"
]
, examples (NumeralValue 1416.15)
[ "1.416,15"
, "tausendvierhundertsechzehn Komma fünfzehn"
]
, examples (NumeralValue 1000000.0)
[ "1.000.000,00",
"eine million"
]
, examples (NumeralValue 2771090092000000.0)
[ "zwei billiarden sieben hundert ein und siebzig billionen neunzig milliarden zwei und neunzig millionen"
[ "zwei billiarden siebenhunderteinundsiebzig billionen neunzig milliarden zweiundneunzig millionen"
]
]
160 changes: 160 additions & 0 deletions Duckling/Numeral/DE/NumParser.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
{-# LANGUAGE DeriveFunctor #-}

module Duckling.Numeral.DE.NumParser (parseNumeral) where

import Prelude
import Control.Applicative
import Data.Char
import Data.List
import Data.Foldable
import Data.String

newtype Parser a
= Parser { runParser :: String -> Maybe (a, String) }
deriving Functor

char :: Char -> Parser Char
char c = Parser p
where
p [] = Nothing
p (x:xs)
| x == c = Just (x, xs)
| otherwise = Nothing

instance Applicative Parser where
pure a = Parser (\s -> Just (a, s))
(Parser fp) <*> xp = Parser $ \s ->
case fp s of
Nothing -> Nothing
Just (f,s') -> runParser (f <$> xp) s'

instance Alternative Parser where
empty = Parser (const Nothing)
Parser p1 <|> Parser p2 = Parser $ liftA2 (<|>) p1 p2

type NumParser = Parser Integer

(.+.) :: NumParser -> NumParser -> NumParser
p .+. p' = (+) <$> p <*> p'

(.*.) :: NumParser -> NumParser -> NumParser
p .*. p' = (*) <$> p <*> p'

infixl 6 .+.
infixl 7 .*.

opt :: NumParser -> NumParser
opt p = p <|> Parser p'
where
p' s = Just (0, s)

data NumItem = NumItem { base :: NumParser
, plus10 :: NumParser
, times10 :: [NumParser]
}

defaultNumItem :: Integer -> String -> NumItem
defaultNumItem value form = NumItem { base = p
, plus10 = p .+. ten
, times10 = [p .*. ty]
} where p = assign value form

type Assignment = Integer -> String -> NumParser

assign :: Assignment
assign value = foldr (\c p -> (1 <$ char c) .*. p) (pure value)

ten :: NumParser
ten = assign 10 "zehn"

ty :: NumParser
ty = assign 10 "zig"

hundred :: NumParser
hundred = assign 100 "hundert"

thousand :: NumParser
thousand = assign 1000 "tausend"

und :: NumParser
und = assign 0 "und"

one :: NumItem
one = (defaultNumItem 1 "ein") { plus10 = assign 11 "elf"
, times10 = [ ten ] }

two :: NumItem
two = (defaultNumItem 2 "zwei") { plus10 = assign 12 "zwölf"
, times10 = [ assign 20 "zwanzig" ] }
three :: NumItem
three = (defaultNumItem 3 "drei") { times10 = [ assign 30 "dreißig"
, assign 30 "dreissig" ] }

four :: NumItem
four = defaultNumItem 4 "vier"

five :: NumItem
five = defaultNumItem 5 "fünf"

six :: NumItem
six = (defaultNumItem 6 "sechs") { plus10 = assign 16 "sechzehn"
, times10 = [ assign 60 "sechzig" ] }

seven :: NumItem
seven = (defaultNumItem 7 "sieben") { plus10 = assign 17 "siebzehn"
, times10 = [ assign 70 "siebzig" ] }

eight :: NumItem
eight = defaultNumItem 8 "acht"

nine :: NumItem
nine = defaultNumItem 9 "neun"

digitLexicon :: [NumItem]
digitLexicon = [one, two, three, four, five, six, seven, eight, nine]

from1to9 :: NumParser
from1to9 = foldr ((<|>) . base) empty digitLexicon

tensFrom20 :: NumParser
tensFrom20 = asum (concatMap times10 (tail digitLexicon))

from1to99 :: NumParser
from1to99 = opt (from1to9 .+. und) .+. tensFrom20
<|> foldr ((<|>) . plus10) empty digitLexicon
<|> ten
<|> from1to9

from1to999 :: NumParser
from1to999 = opt (from1to9 .*. hundred .+. opt und) .+. opt from1to99

from1to999999 :: NumParser
from1to999999 = opt (from1to999 .*. thousand .+. opt und) .+. opt from1to999

from1to999999' :: NumParser
from1to999999' = Parser p
where
p s
| isPrefixOf "hundert" s || isPrefixOf "tausend" s
= runParser from1to999999 ("ein" ++ s)
| otherwise
= runParser from1to999999 s

fromYear1100to1999 :: NumParser
fromYear1100to1999 = asum ((\n -> plus10 n .*. hundred) <$> digitLexicon)
.+. opt (opt und .+. from1to99)

allNumerals :: NumParser
allNumerals = fromYear1100to1999
<|> from1to999999'

removeInflection :: (Integer, String) -> Maybe Integer
removeInflection (n, suffix)
| n `mod` 10 == 1 && suffix `elem` inflection = Just n
where
inflection = ["s", "e", "em", "en", "er", "es"]
removeInflection (n, "") = Just n
removeInflection _ = Nothing

parseNumeral :: String -> Maybe Integer
parseNumeral s = removeInflection =<< runParser allNumerals s
Loading

0 comments on commit 9509e04

Please sign in to comment.