Skip to content

Commit

Permalink
Merge pull request #332 from akx/improved-js
Browse files Browse the repository at this point in the history
Improved JavaScript extraction
  • Loading branch information
akx committed Mar 9, 2016
2 parents f5bd94d + 5b09b64 commit 124294a
Show file tree
Hide file tree
Showing 5 changed files with 245 additions and 120 deletions.
26 changes: 23 additions & 3 deletions babel/messages/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,8 +506,12 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
:param comment_tags: a list of translator tags to search for and include
in the results
:param options: a dictionary of additional options (optional)
Supported options are:
* `jsx` -- set to false to disable JSX/E4X support.
* `template_string` -- set to false to disable ES6
template string support.
"""
from babel.messages.jslexer import tokenize, unquote_string
from babel.messages.jslexer import Token, tokenize, unquote_string
funcname = message_lineno = None
messages = []
last_argument = None
Expand All @@ -516,8 +520,24 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
encoding = options.get('encoding', 'utf-8')
last_token = None
call_stack = -1
dotted = any('.' in kw for kw in keywords)

for token in tokenize(
fileobj.read().decode(encoding),
jsx=options.get("jsx", True),
template_string=options.get("template_string", True),
dotted=dotted
):
if ( # Turn keyword`foo` expressions into keyword("foo") calls:
funcname and # have a keyword...
(last_token and last_token.type == 'name') and # we've seen nothing after the keyword...
token.type == 'template_string' # this is a template string
):
message_lineno = token.lineno
messages = [unquote_string(token.value)]
call_stack = 0
token = Token('operator', ')', token.lineno)

for token in tokenize(fileobj.read().decode(encoding)):
if token.type == 'operator' and token.value == '(':
if funcname:
message_lineno = token.lineno
Expand Down Expand Up @@ -577,7 +597,7 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
messages = []
call_stack = -1

elif token.type == 'string':
elif token.type in ('string', 'template_string'):
new_value = unquote_string(token.value)
if concatenate_next:
last_argument = (last_argument or '') + new_value
Expand Down
71 changes: 45 additions & 26 deletions babel/messages/jslexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,57 +9,70 @@
:copyright: (c) 2013 by the Babel Team.
:license: BSD, see LICENSE for more details.
"""

from operator import itemgetter
from collections import namedtuple
import re
from babel._compat import unichr

operators = [
operators = sorted([
'+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
'+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
'>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
'[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
]
operators.sort(key=lambda a: -len(a))
], key=len, reverse=True)

escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

rules = [
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')

Token = namedtuple('Token', 'type value lineno')

_rules = [
(None, re.compile(r'\s+(?u)')),
(None, re.compile(r'<!--.*')),
('linecomment', re.compile(r'//.*')),
('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
('dotted_name', dotted_name_re),
('name', name_re),
('number', re.compile(r'''(?x)(
(?:0|[1-9]\d*)
(\.\d+)?
([eE][-+]?\d+)? |
(0x[a-fA-F0-9]+)
)''')),
('jsx_tag', re.compile(r'<(?:/?)\w+.+?>', re.I)), # May be mangled in `get_rules`
('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
('string', re.compile(r'''(?xs)(
'(?:[^'\\]*(?:\\.[^'\\]*)*)' |
"(?:[^"\\]*(?:\\.[^"\\]*)*)"
)'''))
]

division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')


class Token(tuple):
"""Represents a token as returned by `tokenize`."""
__slots__ = ()

def __new__(cls, type, value, lineno):
return tuple.__new__(cls, (type, value, lineno))
def get_rules(jsx, dotted, template_string):
    """
    Get a tokenization rule list given the passed syntax options.

    Internal to this module.

    :param jsx: include the JSX tag rule
    :param dotted: include the ``dotted_name`` rule (reported as ``'name'``)
    :param template_string: include the ES6 template-string rule
    """
    rules = []
    for token_type, rule in _rules:
        # Drop JSX-specific rules when JSX support is disabled.
        if not jsx and token_type and 'jsx' in token_type:
            continue
        # Drop the back-tick rule when template strings are disabled.
        if not template_string and token_type == 'template_string':
            continue
        if token_type == 'dotted_name':
            if not dotted:
                continue
            # Dotted names are reported to callers as plain 'name' tokens.
            token_type = 'name'
        rules.append((token_type, rule))
    return rules


def indicates_division(token):
Expand All @@ -73,9 +86,9 @@ def indicates_division(token):

def unquote_string(string):
"""Unquote a string with JavaScript rules. The string has to start with
string delimiters (``'`` or ``"``.)
string delimiters (``'``, ``"`` or the back-tick/grave accent (for template strings).)
"""
assert string and string[0] == string[-1] and string[0] in '"\'', \
assert string and string[0] == string[-1] and string[0] in '"\'`', \
'string provided is not properly delimited'
string = line_join_re.sub('\\1', string[1:-1])
result = []
Expand Down Expand Up @@ -127,13 +140,19 @@ def unquote_string(string):
return u''.join(result)


def tokenize(source):
"""Tokenize a JavaScript source. Returns a generator of tokens.
def tokenize(source, jsx=True, dotted=True, template_string=True):
"""
Tokenize JavaScript/JSX source. Returns a generator of tokens.
:param jsx: Enable (limited) JSX parsing.
:param dotted: Read dotted names as single name token.
:param template_string: Support ES6 template strings
"""
may_divide = False
pos = 0
lineno = 1
end = len(source)
rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

while pos < end:
# handle regular rules first
Expand Down
91 changes: 0 additions & 91 deletions tests/messages/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,97 +410,6 @@ def test_extract_strip_comment_tags(self):
u'a prefix too'], messages[1][2])


class ExtractJavaScriptTestCase(unittest.TestCase):
    """Tests for gettext-style message extraction from JavaScript source."""

    def test_simple_extract(self):
        source = BytesIO(b"""\
msg1 = _('simple')
msg2 = gettext('simple')
msg3 = ngettext('s', 'p', 42)
""")
        found = list(
            extract.extract('javascript', source,
                            extract.DEFAULT_KEYWORDS, [], {}))

        self.assertEqual([(1, 'simple', [], None),
                          (2, 'simple', [], None),
                          (3, ('s', 'p'), [], None)], found)

    def test_various_calls(self):
        source = BytesIO(b"""\
msg1 = _(i18n_arg.replace(/"/, '"'))
msg2 = ungettext(i18n_arg.replace(/"/, '"'), multi_arg.replace(/"/, '"'), 2)
msg3 = ungettext("Babel", multi_arg.replace(/"/, '"'), 2)
msg4 = ungettext(i18n_arg.replace(/"/, '"'), "Babels", 2)
msg5 = ungettext('bunny', 'bunnies', parseInt(Math.random() * 2 + 1))
msg6 = ungettext(arg0, 'bunnies', rparseInt(Math.random() * 2 + 1))
msg7 = _(hello.there)
msg8 = gettext('Rabbit')
msg9 = dgettext('wiki', model.addPage())
msg10 = dngettext(domain, 'Page', 'Pages', 3)
""")
        found = list(
            extract.extract('javascript', source,
                            extract.DEFAULT_KEYWORDS, [], {}))
        # Only calls whose message arguments are plain string literals
        # are expected to be picked up.
        self.assertEqual([(5, (u'bunny', u'bunnies'), [], None),
                          (8, u'Rabbit', [], None),
                          (10, (u'Page', u'Pages'), [], None)], found)

    def test_message_with_line_comment(self):
        source = BytesIO(u"""\
// NOTE: hello
msg = _('Bonjour à tous')
""".encode('utf-8'))
        found = list(extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        self.assertEqual(u'Bonjour à tous', found[0][2])
        self.assertEqual([u'NOTE: hello'], found[0][3])

    def test_message_with_multiline_comment(self):
        source = BytesIO(u"""\
/* NOTE: hello
   and bonjour
     and servus */
msg = _('Bonjour à tous')
""".encode('utf-8'))
        found = list(extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        self.assertEqual(u'Bonjour à tous', found[0][2])
        self.assertEqual([u'NOTE: hello', 'and bonjour', ' and servus'],
                         found[0][3])

    def test_ignore_function_definitions(self):
        source = BytesIO(b"""\
function gettext(value) {
    return translations[language][value] || value;
}""")

        found = list(extract.extract_javascript(source, ('gettext',), [], {}))
        self.assertEqual(found, [])

    def test_misplaced_comments(self):
        source = BytesIO(b"""\
/* NOTE: this won't show up */
foo()

/* NOTE: this will */
msg = _('Something')

// NOTE: this will show up
// too.
msg = _('Something else')

// NOTE: but this won't
bar()

_('no comment here')
""")
        found = list(extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        # Translator comments are attached only when immediately adjacent
        # to the extracted call.
        self.assertEqual(u'Something', found[0][2])
        self.assertEqual([u'NOTE: this will'], found[0][3])
        self.assertEqual(u'Something else', found[1][2])
        self.assertEqual([u'NOTE: this will show up', 'too.'], found[1][3])
        self.assertEqual(u'no comment here', found[2][2])
        self.assertEqual([], found[2][3])


class ExtractTestCase(unittest.TestCase):

def test_invalid_filter(self):
Expand Down
Loading

0 comments on commit 124294a

Please sign in to comment.