From 28b6403995eba5755bbf8e17fb86187c6ac21363 Mon Sep 17 00:00:00 2001 From: "James M. Allen" Date: Fri, 21 Apr 2017 12:08:15 -0400 Subject: [PATCH 1/8] Added Georgian language and tests --- data/languagefiles/ka.yaml | 92 ++++++++++++++++++++++++++++++++++++++ data/languages.yaml | 3 +- tests/test_date_parser.py | 3 ++ tests/test_languages.py | 7 ++- 4 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 data/languagefiles/ka.yaml diff --git a/data/languagefiles/ka.yaml b/data/languagefiles/ka.yaml new file mode 100644 index 000000000..a90fcc652 --- /dev/null +++ b/data/languagefiles/ka.yaml @@ -0,0 +1,92 @@ +name: Georgian + +dateorder: DMY + +skip: ["ის", "ზე", "დაახლოებით", "და"] + +monday: + - ორშაბათი + - ორშ +tuesday: + - სამშაბათი + - სამ +wednesday: + - ოთხშაბათი + - ოთხ +thursday: + - ხუთშაბათი + - ხუთ +friday: + - პარასკევი + - პარ +saturday: + - შაბათი + - შაბ +sunday: + - კვირა + - კვი + +january: + - იანვარი + - იან +february: + - თებერვალი + - თებ +march: + - მარტი + - მარ +april: + - აპრილი + - აპრ +may: + - მაისი + - მაი +june: + - ივნისი + - ივნ +july: + - ივლისი + - ივლ +august: + - აგვისტო + - აგვ +september: + - სექტემბერი + - სექ +october: + - ოქტომბერი + - ოქტ +november: + - ნოემბერი + - ნოე +december: + - დეკემბერი + - დეკ + +year: + - წლის + - წ. +month: + - თვე +week: + - კვირა +day: + - დღე +hour: + - საათი +minute: + - წუთი +second: + - წამი + +ago: + - წინ +in: + - დღეიდან + +simplifications: + - ახლა: now + - გუშინ: 1 დღე + - ხვალ: დღეიდან 1 დღე + - დღეს: 0 დღე + - ერთ: 1 diff --git a/data/languages.yaml b/data/languages.yaml index 0344d0c3c..d8e2201ba 100644 --- a/data/languages.yaml +++ b/data/languages.yaml @@ -1,7 +1,7 @@ base: skip: [" ", ".", ",", ";", "-", "/", "'", "|", "@", "[", "]", ","] -languageorder: ['en', 'ar', 'be', 'bg', 'bn', 'cs', 'da', 'de', 'es', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'nl', 'pl', 'pt', 'ro', 'ru', 'th', 'tl', 'tr', 'uk', 'vi', 'zh'] +languageorder: ['en', 'ar', 'be', 'bg', 'bn', 'cs', 'da', 'de', 'es', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'nl', 'pl', 'pt', 'ro', 'ru', 'th', 'tl', 'tr', 'uk', 'vi', 'zh'] ar: !include languagefiles/ar.yaml be: !include languagefiles/be.yaml @@ -21,6 +21,7 @@ hu: !include languagefiles/hu.yaml id: !include languagefiles/id.yaml it: !include languagefiles/it.yaml ja: !include languagefiles/ja.yaml +ka: !include languagefiles/ka.yaml nl: !include languagefiles/nl.yaml pl: !include languagefiles/pl.yaml pt: !include languagefiles/pt.yaml diff --git a/tests/test_date_parser.py b/tests/test_date_parser.py index 2e5a541cd..65a362429 100644 --- a/tests/test_date_parser.py +++ b/tests/test_date_parser.py @@ -360,6 +360,9 @@ def setUp(self): param('ग्यारह जुलाई 1994, 11:12',datetime(1994, 7, 11, 11, 12)), param('१७ अक्टूबर २०१८',datetime(2018, 10, 17, 0, 0)), param('12 जनवरी 1997 11:08 अपराह्न',datetime(1997, 1, 12, 23, 8)), + # Georgian dates + param('2011 წლის 17 მარტი, ოთხშაბათი', datetime(2011, 3, 17, 0, 0)), + param('2015 წ. 12 ივნ, 15:34', datetime(2015, 6, 12, 15, 34)) ]) def test_dates_parsing(self, date_string, expected): self.given_parser(settings={'NORMALIZE': False, diff --git a/tests/test_languages.py b/tests/test_languages.py index 240aa9542..f77e1435e 100644 --- a/tests/test_languages.py +++ b/tests/test_languages.py @@ -174,7 +174,7 @@ def setUp(self): # Bangla param('bn', "সেপ্টেম্বর 03 2014", "september 03 2014"), param('bn', "শুক্রবার, 03 সেপ্টেম্বর 2014", "friday 03 september 2014"), - + #Hindi param('hi', 'सोमवार 13 जून 1998','monday 13 june 1998'), param('hi', 'मंगल 16 1786 12:18','tuesday 16 1786 12:18'), @@ -407,6 +407,11 @@ def test_translation(self, shortname, datetime_string, expected_translation): param('hi', "सन् १९२०"," 1920"), param('hi',"आठ पूर्वाह्न","8 am"), param('hi',"बारह सेकंड पूर्व","12 second ago"), + # Georgian + param('ka', 'გუშინ', '1 day'), + param('ka', 'დღეს', '0 day'), + param('ka', 'ერთ თვე', '1 month'), + param('ka', 'დღეიდან ერთ კვირა', 'in 1 week'), ]) def test_freshness_translation(self, shortname, datetime_string, expected_translation): # Finnish language use "t" as hour, so empty SKIP_TOKENS. From ea745b14c1ecce9359861b416df092b62552cff7 Mon Sep 17 00:00:00 2001 From: Christian Stuart Date: Thu, 11 May 2017 14:57:26 +0200 Subject: [PATCH 2/8] language.simplify regex memoization --- dateparser/languages/language.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/dateparser/languages/language.py b/dateparser/languages/language.py index 80a182573..29445fa00 100644 --- a/dateparser/languages/language.py +++ b/dateparser/languages/language.py @@ -16,6 +16,7 @@ class Language(object): _dictionary = None _normalized_dictionary = None _simplifications = None + _simplification_patterns = None _normalized_simplifications = None _splitters = None _wordchars = None @@ -59,14 +60,24 @@ def translate(self, date_string, keep_formatting=False, settings=None): def _simplify(self, date_string, settings=None): date_string = date_string.lower() for simplification in self._get_simplifications(settings=settings): - pattern, replacement = list(simplification.items())[0] - if not self.info.get('no_word_spacing', False): - replacement = wrap_replacement_for_regex(replacement, pattern) - pattern = r'(\A|\d|_|\W)%s(\d|_|\W|\Z)' % pattern - date_string = re.sub( - pattern, replacement, date_string, flags=re.IGNORECASE | re.UNICODE).lower() + pattern, replacement = self._get_simplification_substitution(simplification) + date_string = pattern.sub(replacement, date_string).lower() return date_string + def _get_simplification_substitution(self, simplification): + pattern, replacement = list(simplification.items())[0] + if not self.info.get('no_word_spacing', False): + replacement = wrap_replacement_for_regex(replacement, pattern) + pattern = r'(\A|\d|_|\W)%s(\d|_|\W|\Z)' % pattern + + if self._simplification_patterns is None: + self._simplification_patterns = {} + + if pattern not in self._simplification_patterns: + self._simplification_patterns[pattern] = re.compile(pattern, flags=re.IGNORECASE | re.UNICODE) + pattern = self._simplification_patterns[pattern] + return pattern, replacement + def _clear_future_words(self, words): freshness_words = set(['day', 'week', 'month', 'year', 'hour', 'minute', 'second']) if set(words).isdisjoint(freshness_words): From bbc6d92b010c066b6436a617c1edebbdab9f2ca4 Mon Sep 17 00:00:00 2001 From: sarthak5 Date: Sun, 14 May 2017 22:42:00 +0530 Subject: [PATCH 3/8] resolved failing test for python 3.6 --- tests/test_freshness_date_parser.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_freshness_date_parser.py b/tests/test_freshness_date_parser.py index 04448c6ec..b257b245e 100644 --- a/tests/test_freshness_date_parser.py +++ b/tests/test_freshness_date_parser.py @@ -5,6 +5,7 @@ from datetime import datetime, timedelta, date, time from functools import wraps import pytz +import regex as re from dateutil.relativedelta import relativedelta from mock import Mock, patch @@ -308,7 +309,7 @@ def setUp(self): ago={'years': 1, 'months': 1, 'weeks': 1, 'days': 1, 'hours': 1, 'minutes': 1}, period='day'), # param('এখন', ago={'seconds': 0}, period='day'), - + # Hindi dates param('1 घंटे पहले', ago={'hours': 1},period='day'), param('15 मिनट पहले',ago={'minutes':15},period='day'), @@ -317,7 +318,7 @@ def setUp(self): param('1 वर्ष 7 महीने', ago={'years': 1, 'months': 7}, period='month'), param('आज', ago={'days': 0}, period='day'), ]) - + def test_relative_past_dates(self, date_string, ago, period): self.given_parser(settings={'NORMALIZE': False}) self.given_date_string(date_string) @@ -581,7 +582,7 @@ def test_relative_past_dates(self, date_string, ago, period): ago={'years': 1, 'months': 1, 'weeks': 1, 'days': 1, 'hours': 1, 'minutes': 1}, period='day'), # param('এখন', ago={'seconds': 0}, period='day'), - + # Hindi dates param('1 घंटे पहले', ago={'hours': 1},period='day'), param('15 मिनट पहले',ago={'minutes':15},period='day'), @@ -680,7 +681,7 @@ def test_normalized_relative_dates(self, date_string, ago, period): param('gelecek hafta', in_future={'weeks': 1}, period='week'), param('gelecek ay', in_future={'months': 1}, period='month'), param('gelecek yıl', in_future={'years': 1}, period='year'), - + #Hindi dates #param('1 वर्ष 10 महीने में', in_future={'years': 1, 'months': 10}, period='month'), param('15 घंटे बाद', in_future={'hours': 15}, period='day'), @@ -716,6 +717,7 @@ def test_dates_not_supported_by_date_time(self, date_string): self.given_parser() self.given_date_string(date_string) self.when_date_is_parsed() + self.error = ValueError(re.sub('year \d+ is out of range','year is out of range',str(self.error))) self.then_error_was_raised(ValueError, ['year is out of range', "('year must be in 1..9999'"]) From aeed6180d33554fafaac3de8a61a7a18ca2c3ab3 Mon Sep 17 00:00:00 2001 From: sarthak5 Date: Mon, 15 May 2017 15:45:49 +0530 Subject: [PATCH 4/8] added necessary changes --- tests/test_freshness_date_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_freshness_date_parser.py b/tests/test_freshness_date_parser.py index b257b245e..e9894e0e8 100644 --- a/tests/test_freshness_date_parser.py +++ b/tests/test_freshness_date_parser.py @@ -717,7 +717,8 @@ def test_dates_not_supported_by_date_time(self, date_string): self.given_parser() self.given_date_string(date_string) self.when_date_is_parsed() - self.error = ValueError(re.sub('year \d+ is out of range','year is out of range',str(self.error))) + if isinstance(self.error, ValueError): + self.error = ValueError(re.sub('year [-+]*\d+ is out of range','year is out of range',str(self.error))) self.then_error_was_raised(ValueError, ['year is out of range', "('year must be in 1..9999'"]) From b1e92f39d6a2b97ba05165b9f237a4fc496cb758 Mon Sep 17 00:00:00 2001 From: "James M. Allen" Date: Wed, 17 May 2017 15:23:10 -0400 Subject: [PATCH 5/8] empty file to trigger re-run --- rerun | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 rerun diff --git a/rerun b/rerun new file mode 100644 index 000000000..e69de29bb From e165972add35fce629d2f3b187e63fbfdf33846b Mon Sep 17 00:00:00 2001 From: "James M. Allen" Date: Wed, 17 May 2017 15:23:20 -0400 Subject: [PATCH 6/8] removing empty file --- rerun | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 rerun diff --git a/rerun b/rerun deleted file mode 100644 index e69de29bb..000000000 From 2366409c4b39d90a8db3d34e7b5cc1efdc5334f3 Mon Sep 17 00:00:00 2001 From: Waqas Shabir Date: Sun, 21 May 2017 15:39:38 -0400 Subject: [PATCH 7/8] Modify test_dates_not_supported_by_date_time()... ... to check for just a substring instead --- tests/test_freshness_date_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_freshness_date_parser.py b/tests/test_freshness_date_parser.py index e9894e0e8..6b0f3c99c 100644 --- a/tests/test_freshness_date_parser.py +++ b/tests/test_freshness_date_parser.py @@ -717,9 +717,7 @@ def test_dates_not_supported_by_date_time(self, date_string): self.given_parser() self.given_date_string(date_string) self.when_date_is_parsed() - if isinstance(self.error, ValueError): - self.error = ValueError(re.sub('year [-+]*\d+ is out of range','year is out of range',str(self.error))) - self.then_error_was_raised(ValueError, ['year is out of range', + self.then_error_was_raised(ValueError, ['is out of range', "('year must be in 1..9999'"]) @parameterized.expand([ From 063ba4ad3e2a04a1e129f2c6156f9c85eb5fb0cc Mon Sep 17 00:00:00 2001 From: Waqas Shabir Date: Sun, 21 May 2017 16:38:18 -0400 Subject: [PATCH 8/8] Add georgian to supported languages in readme. --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index caa923fb5..922e81649 100644 --- a/README.rst +++ b/README.rst @@ -254,6 +254,7 @@ Supported languages * Hebrew * Hindi * Hungarian +* Georgian * German * Indonesian * Italian