Skip to content
This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Permalink
fix and test bleu_hook.bleu_tokenize (#514)
Browse files Browse the repository at this point in the history
* fix and test bleu_hook.bleu_tokenize

* make the test work in Python2
  • Loading branch information
martinpopel authored and rsepassi committed Jan 12, 2018
1 parent cc43389 commit d9cba5c
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 4 deletions.
9 changes: 5 additions & 4 deletions tensor2tensor/utils/bleu_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def __init__(self):
def _property_chars(prefix):
  """Collect every Unicode character whose category starts with `prefix`.

  Args:
    prefix: string compared against the start of each character's
      `unicodedata.category` code (e.g. 'P' or 'S').

  Returns:
    One string made of all matching characters, in code point order.
  """
  # `range` excludes its stop value, so go one past `sys.maxunicode` to make
  # sure the highest code point is examined as well.
  candidates = (six.unichr(code) for code in range(sys.maxunicode + 1))
  return ''.join(char for char in candidates
                 if unicodedata.category(char).startswith(prefix))
punctuation = self._property_chars('P')
punctuation = _property_chars('P')
self.nondigit_punct_re = re.compile(r'([^\d])([' + punctuation + r'])')
self.punct_nondigit_re = re.compile(r'([' + punctuation + r'])([^\d])')
self.symbol_re = re.compile('([' + _property_chars('S') + '])')
Expand Down Expand Up @@ -183,9 +183,10 @@ def bleu_tokenize(string):
Returns:
a list of tokens
"""
string = UnicodeRegex.nondigit_punct_re.sub(r'\1 \2 ', string)
string = UnicodeRegex.punct_nondigit_re.sub(r' \1 \2', string)
string = UnicodeRegex.symbol_re.sub(r' \1 ', string)
uregex = UnicodeRegex()
string = uregex.nondigit_punct_re.sub(r'\1 \2 ', string)
string = uregex.punct_nondigit_re.sub(r' \1 \2', string)
string = uregex.symbol_re.sub(r' \1 ', string)
return string.split()


Expand Down
4 changes: 4 additions & 0 deletions tensor2tensor/utils/bleu_hook_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,9 @@ def testComputeMultipleNgrams(self):
actual_bleu = 0.3436
self.assertAllClose(bleu, actual_bleu, atol=1e-03)

def testBleuTokenize(self):
  """Checks that bleu_tokenize separates a comma and curly quotes from words."""
  sentence = u'hi, “there”'
  expected_tokens = [u'hi', u',', u'“', u'there', u'”']
  self.assertEqual(bleu_hook.bleu_tokenize(sentence), expected_tokens)


if __name__ == '__main__':
tf.test.main()

0 comments on commit d9cba5c

Please sign in to comment.