Skip to content

Commit

Permalink
Merge pull request scrapinghub#99 from scrapinghub/py3-porting-at-europython
Browse files Browse the repository at this point in the history

fixes to work under Python3 and PyPy
  • Loading branch information
asadurski committed Jul 29, 2015
2 parents 73aaca3 + 38b1c0b commit d360343
Show file tree
Hide file tree
Showing 15 changed files with 80 additions and 92 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ language: python

python:
- "2.7"
- "3.3"
- "3.4"
- "pypy"
env:
- TOXENV=py27
install:
Expand Down
2 changes: 1 addition & 1 deletion dateparser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
__version__ = '0.2.1'

from date import DateDataParser
from .date import DateDataParser

_default_parser = DateDataParser(allow_redetect_language=True)

Expand Down
14 changes: 5 additions & 9 deletions dateparser/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import calendar
import collections
import re
import six
from datetime import datetime, timedelta
from types import NoneType
from warnings import warn

from dateutil.relativedelta import relativedelta
Expand Down Expand Up @@ -99,10 +99,6 @@ def parse_with_formats(date_string, date_formats):
:returns: :class:`datetime.datetime`, dict or None
"""
# Encode to support locale setting in spiders
if isinstance(date_string, unicode):
date_string = date_string.encode('utf-8')

period = 'day'
for date_format in date_formats:
try:
Expand Down Expand Up @@ -130,10 +126,10 @@ class _DateLanguageParser(object):
DATE_FORMATS_ERROR_MESSAGE = "Date formats should be list, tuple or set of strings"

def __init__(self, language, date_string, date_formats):
if isinstance(date_formats, basestring):
if isinstance(date_formats, six.string_types):
warn(self.DATE_FORMATS_ERROR_MESSAGE, FutureWarning)
date_formats = [date_formats]
elif not isinstance(date_formats, (list, tuple, collections.Set, NoneType)):
elif not (date_formats is None or isinstance(date_formats, (list, tuple, collections.Set))):
raise TypeError(self.DATE_FORMATS_ERROR_MESSAGE)

self.language = language
Expand Down Expand Up @@ -261,13 +257,13 @@ def __init__(self, languages=None, allow_redetect_language=False):

if allow_redetect_language:
self.language_detector = AutoDetectLanguage(
languages if languages else available_language_map.values(),
languages if languages else list(available_language_map.values()),
allow_redetection=True)
elif languages:
self.language_detector = ExactLanguages(languages=languages)
else:
self.language_detector = AutoDetectLanguage(
available_language_map.values(), allow_redetection=False)
list(available_language_map.values()), allow_redetection=False)

def get_date_data(self, date_string, date_formats=None):
"""
Expand Down
38 changes: 10 additions & 28 deletions dateparser/date_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,22 @@
from __future__ import unicode_literals

import calendar
import re, sys
import re
import sys
from datetime import datetime
from collections import OrderedDict

import six
from dateutil import parser
from dateutil.relativedelta import relativedelta
from dateparser.timezone_parser import pop_tz_offset_from_string, convert_to_local_tz

from conf import settings
from .conf import settings


binary_type = bytes if sys.version_info[0] == 3 else str


class new_relativedelta(relativedelta):
""" dateutil does not check if result of parsing weekday is in the future.
Although items dates are already in the past, so we need to fix this particular case.
"""

def __new__(cls, *args, **kwargs):
if not args and len(kwargs) == 1 and 'weekday' in kwargs:
return super(new_relativedelta, cls).__new__(cls, *args, **kwargs)
else:
# use original class to parse other cases
return relativedelta(*args, **kwargs)

def __add__(self, other):
ret = super(new_relativedelta, self).__add__(other)
if ret > datetime.utcnow():
ret -= relativedelta(days=7)
return ret


parser.relativedelta.relativedelta = new_relativedelta


class new_parser(parser.parser):
"""
Implements an alternate parse method which supports preference to dates in future and past.
Expand Down Expand Up @@ -72,7 +52,7 @@ def get_period(res):
('month', ['month']),
('year', ['year']),
])
for period, markers in periods.iteritems():
for period, markers in six.iteritems(periods):
for marker in markers:
if getattr(res, marker) is not None:
return period
Expand All @@ -95,7 +75,9 @@ def _populate(cls, res, default):

# Fix weekday
if res.weekday is not None and not res.day:
new_date = new_date + new_relativedelta(weekday=res.weekday)
new_date = new_date + relativedelta(weekday=res.weekday)
if new_date > datetime.utcnow():
new_date -= relativedelta(days=7)

# Correct date and return
return cls._correct(new_date, [key + 's' for key in repl.keys()], default)
Expand Down Expand Up @@ -177,14 +159,14 @@ def dateutil_parse(date_string, **kwargs):
# https://bugs.launchpad.net/dateutil/+bug/1042851
try:
return new_parser().parse(date_string, **kwargs)
except TypeError, e:
except TypeError as e:
raise ValueError(e, "Invalid date: %s" % date_string)


class DateParser(object):

def parse(self, date_string):
date_string = unicode(date_string)
date_string = six.text_type(date_string)

if not date_string.strip():
raise ValueError("Empty string")
Expand Down
2 changes: 1 addition & 1 deletion dateparser/freshness_date_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def _are_all_words_units(self, date_string):

words = filter(lambda x: x if x else False, re.split('\W', date_string))
words = filter(lambda x: not re.match(r'%s' % '|'.join(skip), x), words)
return not bool(words)
return not list(words)

def _parse_time(self, date_string):
"""Attempts to parse time part of date strings like '1 day ago, 2 PM' """
Expand Down
17 changes: 9 additions & 8 deletions dateparser/languages/dictionary.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from itertools import izip_longest
from six.moves import zip_longest
from operator import methodcaller

DATEUTIL_PARSER_HARDCODED_TOKENS = [":", ".", " ", "-", "/"] # Consts used in dateutil.parser._parse
Expand All @@ -21,19 +22,19 @@ def __init__(self, language_info):

if 'skip' in language_info:
skip = map(methodcaller('lower'), language_info['skip'])
dictionary.update(izip_longest(skip, [], fillvalue=None))
dictionary.update(zip_longest(skip, [], fillvalue=None))
if 'pertain' in language_info:
pertain = map(methodcaller('lower'), language_info['pertain'])
dictionary.update(izip_longest(pertain, [], fillvalue=None))
dictionary.update(zip_longest(pertain, [], fillvalue=None))
for word in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
'january', 'february', 'march', 'april', 'may', 'june', 'july',
'august', 'september', 'october', 'november', 'december',
'year', 'month', 'week', 'day', 'hour', 'minute', 'second',
'ago']:
translations = map(methodcaller('lower'), language_info[word])
dictionary.update(izip_longest(translations, [], fillvalue=word))
dictionary.update(izip_longest(ALWAYS_KEEP_TOKENS, ALWAYS_KEEP_TOKENS))
dictionary.update(izip_longest(map(methodcaller('lower'),
dictionary.update(zip_longest(translations, [], fillvalue=word))
dictionary.update(zip_longest(ALWAYS_KEEP_TOKENS, ALWAYS_KEEP_TOKENS))
dictionary.update(zip_longest(map(methodcaller('lower'),
DATEUTIL_PARSERINFO_KNOWN_TOKENS),
DATEUTIL_PARSERINFO_KNOWN_TOKENS))

Expand Down Expand Up @@ -84,7 +85,7 @@ def _get_split_regex(self):
def _construct_split_regex(self):
known_words_group = u"|".join(map(re.escape, self._get_sorted_words()))
if self._no_word_spacing:
regex = ur"^(.*?)({})(.*)$".format(known_words_group)
regex = r"^(.*?)({})(.*)$".format(known_words_group)
else:
regex = ur"^(.*?(?:\A|\d|_|\W))({})((?:\d|_|\W|\Z).*)$".format(known_words_group)
regex = r"^(.*?(?:\A|\d|_|\W))({})((?:\d|_|\W|\Z).*)$".format(known_words_group)
self._split_regex = re.compile(regex, re.UNICODE | re.IGNORECASE)
12 changes: 6 additions & 6 deletions dateparser/languages/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def __init__(self, shortname, language_info):
self.shortname = shortname
self.info = language_info.copy()
for simplification in self.info.get('simplifications', []):
key, value = simplification.items()[0]
key, value = list(simplification.items())[0]
if isinstance(value, int):
simplification[key] = str(value)

Expand Down Expand Up @@ -51,15 +51,15 @@ def translate(self, date_string, keep_formatting=False):
if word in dictionary:
words[i] = dictionary[word] or ''

return self._join(filter(bool, words), separator="" if keep_formatting else " ")
return self._join(list(filter(bool, words)), separator="" if keep_formatting else " ")

def _simplify(self, date_string):
date_string = date_string.lower()
for simplification in self.info.get('simplifications', []):
pattern, replacement = simplification.items()[0]
pattern, replacement = list(simplification.items())[0]
if not self.info.get('no_word_spacing', False):
replacement = wrap_replacement_for_regex(replacement, pattern)
pattern = ur'(\A|\d|_|\W)%s(\d|_|\W|\Z)' % pattern
pattern = r'(\A|\d|_|\W)%s(\d|_|\W|\Z)' % pattern
date_string = re.sub(pattern, replacement, date_string, flags=re.IGNORECASE | re.UNICODE).lower()
return date_string

Expand All @@ -83,8 +83,8 @@ def _are_all_words_in_the_dictionary(self, words):

def _split(self, date_string, keep_formatting):
tokens = [date_string]
tokens = self._split_tokens_with_regex(tokens, "(\d+)")
tokens = self._split_tokens_by_known_words(tokens, keep_formatting)
tokens = list(self._split_tokens_with_regex(tokens, "(\d+)"))
tokens = list(self._split_tokens_by_known_words(tokens, keep_formatting))
return tokens

def _split_tokens_with_regex(self, tokens, regex):
Expand Down
7 changes: 4 additions & 3 deletions dateparser/languages/loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
from pkgutil import get_data

import six
from yaml import load as load_yaml

from .language import Language
Expand All @@ -11,7 +12,7 @@ class LanguageDataLoader(object):
_data = None

def __init__(self, file=None):
if isinstance(file, basestring):
if isinstance(file, six.string_types):
file = open(file)
self.file = file

Expand Down Expand Up @@ -39,15 +40,15 @@ def _load_data(self):
base_data = data.pop('base', {'skip': []})
base_data['skip'] += settings.SKIP_TOKENS
known_languages = {}
for shortname, language_info in data.iteritems():
for shortname, language_info in six.iteritems(data):
self._update_language_info_with_base_info(language_info, base_data)
language = Language(shortname, language_info)
if language.validate_info():
known_languages[shortname] = language
self._data = known_languages

def _update_language_info_with_base_info(self, language_info, base_info):
for key, values in base_info.iteritems():
for key, values in six.iteritems(base_info):
if isinstance(values, list):
extended_values = (values + language_info[key]) if key in language_info else values
language_info[key] = extended_values
Expand Down
25 changes: 13 additions & 12 deletions dateparser/languages/validation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import re
import six
from dateparser.utils import get_logger


Expand Down Expand Up @@ -54,7 +55,7 @@ def _validate_type(cls, language_id, info):
def _validate_name(cls, language_id, info):
result = True

if 'name' not in info or not isinstance(info['name'], basestring) or not info['name']:
if 'name' not in info or not isinstance(info['name'], six.string_types) or not info['name']:
cls.get_logger().error("Language '%(id)s' does not have a name", {'id': language_id})
result = False

Expand Down Expand Up @@ -86,7 +87,7 @@ def _validate_skip_list(cls, language_id, info):
skip_tokens_list = info['skip']
if isinstance(skip_tokens_list, list):
for token in skip_tokens_list:
if not isinstance(token, basestring) or not token:
if not isinstance(token, six.string_types) or not token:
cls.get_logger().error(
"Invalid 'skip' token %(token)r for '%(id)s' language: "
"expected not empty string",
Expand All @@ -111,7 +112,7 @@ def _validate_pertain_list(cls, language_id, info):
pertain_tokens_list = info['skip']
if isinstance(pertain_tokens_list, list):
for token in pertain_tokens_list:
if not isinstance(token, basestring) or not token:
if not isinstance(token, six.string_types) or not token:
cls.get_logger().error(
"Invalid 'pertain' token %(token)r for '%(id)s' language: "
"expected not empty string",
Expand Down Expand Up @@ -141,7 +142,7 @@ def _validate_weekdays(cls, language_id, info):
translations_list = info[weekday]
if isinstance(translations_list, list):
for token in translations_list:
if not isinstance(token, basestring) or not token:
if not isinstance(token, six.string_types) or not token:
cls.get_logger().error(
"Invalid '%(weekday)s' translation %(token)r for '%(id)s' language: "
"expected not empty string",
Expand Down Expand Up @@ -174,7 +175,7 @@ def _validate_months(cls, language_id, info):
translations_list = info[month]
if isinstance(translations_list, list):
for token in translations_list:
if not isinstance(token, basestring) or not token:
if not isinstance(token, six.string_types) or not token:
cls.get_logger().error(
"Invalid '%(month)s' translation %(token)r for '%(id)s' language: "
"expected not empty string",
Expand Down Expand Up @@ -204,7 +205,7 @@ def _validate_units(cls, language_id, info):
translations_list = info[unit]
if isinstance(translations_list, list):
for token in translations_list:
if not isinstance(token, basestring) or not token:
if not isinstance(token, six.string_types) or not token:
cls.get_logger().error(
"Invalid '%(unit)s' translation %(token)r for '%(id)s' language: "
"expected not empty string",
Expand Down Expand Up @@ -234,7 +235,7 @@ def _validate_other_words(cls, language_id, info):
translations_list = info[word]
if isinstance(translations_list, list):
for token in translations_list:
if not isinstance(token, basestring) or not token:
if not isinstance(token, six.string_types) or not token:
cls.get_logger().error(
"Invalid '%(word)s' translation %(token)r for '%(id)s' language: "
"expected not empty string",
Expand Down Expand Up @@ -267,8 +268,8 @@ def _validate_simplifications(cls, language_id, info):
result = False
continue

key, value = simplification.items()[0]
if not isinstance(key, basestring) or not isinstance(value, (basestring, int)):
key, value = list(simplification.items())[0]
if not isinstance(key, six.string_types) or not isinstance(value, (six.string_types, int)):
cls.get_logger().error(
"Invalid simplification %(simplification)r for '%(id)s' language: "
"each simplification suppose to be string-to-string-or-int mapping",
Expand All @@ -277,7 +278,7 @@ def _validate_simplifications(cls, language_id, info):
continue

compiled_key = re.compile(key)
value = unicode(value)
value = six.text_type(value)
replacements = re.findall(r'\\(\d+)', value)
replacements.extend(re.findall(r'\\g<(.+?)>', value))

Expand Down Expand Up @@ -308,7 +309,7 @@ def _validate_simplifications(cls, language_id, info):
"unknown groups %(groups)s",
{'simplification': simplification,
'id': language_id,
'groups': ", ".join(map(unicode, sorted(extra_groups)))})
'groups': ", ".join(map(six.text_type, sorted(extra_groups)))})
result = False

if not_used_groups:
Expand All @@ -317,7 +318,7 @@ def _validate_simplifications(cls, language_id, info):
"groups %(groups)s were not used",
{'simplification': simplification,
'id': language_id,
'groups': ", ".join(map(unicode, sorted(not_used_groups)))})
'groups': ", ".join(map(six.text_type, sorted(not_used_groups)))})
result = False
else:
cls.get_logger().error(
Expand Down
6 changes: 4 additions & 2 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def tearDown(self):
for patch in reversed(self.__patches):
patch.stop()

def then_error_was_raised(self, error_cls, error_message):
def then_error_was_raised(self, error_cls, allowed_substrings=()):
self.assertIsInstance(self.error, error_cls)
self.assertEqual(error_message, str(self.error))
self.assertTrue(any(mesg in str(self.error) for mesg in allowed_substrings),
"Didn't found any of the expected messages (%r) -- message was: %r" % (
allowed_substrings, self.error))
Loading

0 comments on commit d360343

Please sign in to comment.