Skip to content

Commit

Permalink
Merge pull request #332 from akx/improved-js
Browse files Browse the repository at this point in the history
Improved JavaScript extraction
  • Loading branch information
akx committed Mar 9, 2016
2 parents f5bd94d + 5b09b64 commit 124294a
Show file tree
Hide file tree
Showing 5 changed files with 245 additions and 120 deletions.
26 changes: 23 additions & 3 deletions babel/messages/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,8 +506,12 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
:param comment_tags: a list of translator tags to search for and include
in the results
:param options: a dictionary of additional options (optional)
Supported options are:
* `jsx` -- set to false to disable JSX/E4X support.
* `template_string` -- set to false to disable ES6
template string support.
"""
from babel.messages.jslexer import tokenize, unquote_string
from babel.messages.jslexer import Token, tokenize, unquote_string
funcname = message_lineno = None
messages = []
last_argument = None
Expand All @@ -516,8 +520,24 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
encoding = options.get('encoding', 'utf-8')
last_token = None
call_stack = -1
dotted = any('.' in kw for kw in keywords)

for token in tokenize(
fileobj.read().decode(encoding),
jsx=options.get("jsx", True),
template_string=options.get("template_string", True),
dotted=dotted
):
if ( # Turn keyword`foo` expressions into keyword("foo") calls:
funcname and # have a keyword...
(last_token and last_token.type == 'name') and # we've seen nothing after the keyword...
token.type == 'template_string' # this is a template string
):
message_lineno = token.lineno
messages = [unquote_string(token.value)]
call_stack = 0
token = Token('operator', ')', token.lineno)

for token in tokenize(fileobj.read().decode(encoding)):
if token.type == 'operator' and token.value == '(':
if funcname:
message_lineno = token.lineno
Expand Down Expand Up @@ -577,7 +597,7 @@ def extract_javascript(fileobj, keywords, comment_tags, options):
messages = []
call_stack = -1

elif token.type == 'string':
elif token.type in ('string', 'template_string'):
new_value = unquote_string(token.value)
if concatenate_next:
last_argument = (last_argument or '') + new_value
Expand Down
71 changes: 45 additions & 26 deletions babel/messages/jslexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,57 +9,70 @@
:copyright: (c) 2013 by the Babel Team.
:license: BSD, see LICENSE for more details.
"""

from operator import itemgetter
from collections import namedtuple
import re
from babel._compat import unichr

operators = [
operators = sorted([
'+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
'+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
'>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
'[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
]
operators.sort(key=lambda a: -len(a))
], key=len, reverse=True)

escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

rules = [
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')

Token = namedtuple('Token', 'type value lineno')

_rules = [
(None, re.compile(r'\s+(?u)')),
(None, re.compile(r'<!--.*')),
('linecomment', re.compile(r'//.*')),
('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
('dotted_name', dotted_name_re),
('name', name_re),
('number', re.compile(r'''(?x)(
(?:0|[1-9]\d*)
(\.\d+)?
([eE][-+]?\d+)? |
(0x[a-fA-F0-9]+)
)''')),
('jsx_tag', re.compile(r'<(?:/?)\w+.+?>', re.I)), # May be mangled in `get_rules`
('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
('string', re.compile(r'''(?xs)(
'(?:[^'\\]*(?:\\.[^'\\]*)*)' |
"(?:[^"\\]*(?:\\.[^"\\]*)*)"
)'''))
]

division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')


class Token(tuple):
"""Represents a token as returned by `tokenize`."""
__slots__ = ()

def __new__(cls, type, value, lineno):
return tuple.__new__(cls, (type, value, lineno))
def get_rules(jsx, dotted, template_string):
    """
    Get a tokenization rule list given the passed syntax options.

    Internal to this module.

    :param jsx: include the JSX tag rule
    :param dotted: include the ``dotted_name`` rule (reported as ``'name'``)
    :param template_string: include the ES6 template-string rule
    """
    rules = []
    for token_type, rule in _rules:
        # Drop JSX-specific rules when JSX support is disabled.
        if not jsx and token_type and 'jsx' in token_type:
            continue
        # Drop the back-tick rule when template strings are disabled.
        if not template_string and token_type == 'template_string':
            continue
        if token_type == 'dotted_name':
            if not dotted:
                continue
            # Dotted names are reported to callers as plain 'name' tokens.
            token_type = 'name'
        rules.append((token_type, rule))
    return rules


def indicates_division(token):
Expand All @@ -73,9 +86,9 @@ def indicates_division(token):

def unquote_string(string):
"""Unquote a string with JavaScript rules. The string has to start with
string delimiters (``'`` or ``"``.)
string delimiters (``'``, ``"`` or the back-tick/grave accent (for template strings).)
"""
assert string and string[0] == string[-1] and string[0] in '"\'', \
assert string and string[0] == string[-1] and string[0] in '"\'`', \
'string provided is not properly delimited'
string = line_join_re.sub('\\1', string[1:-1])
result = []
Expand Down Expand Up @@ -127,13 +140,19 @@ def unquote_string(string):
return u''.join(result)


def tokenize(source):
"""Tokenize a JavaScript source. Returns a generator of tokens.
def tokenize(source, jsx=True, dotted=True, template_string=True):
"""
Tokenize JavaScript/JSX source. Returns a generator of tokens.
:param jsx: Enable (limited) JSX parsing.
:param dotted: Read dotted names as single name token.
:param template_string: Support ES6 template strings
"""
may_divide = False
pos = 0
lineno = 1
end = len(source)
rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

while pos < end:
# handle regular rules first
Expand Down
91 changes: 0 additions & 91 deletions tests/messages/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,97 +410,6 @@ def test_extract_strip_comment_tags(self):
u'a prefix too'], messages[1][2])


class ExtractJavaScriptTestCase(unittest.TestCase):
    """Tests for gettext-style message extraction from JavaScript source."""

    def test_simple_extract(self):
        source = BytesIO(b"""\
msg1 = _('simple')
msg2 = gettext('simple')
msg3 = ngettext('s', 'p', 42)
""")
        found = list(
            extract.extract('javascript', source,
                            extract.DEFAULT_KEYWORDS, [], {}))

        self.assertEqual([(1, 'simple', [], None),
                          (2, 'simple', [], None),
                          (3, ('s', 'p'), [], None)], found)

    def test_various_calls(self):
        source = BytesIO(b"""\
msg1 = _(i18n_arg.replace(/"/, '"'))
msg2 = ungettext(i18n_arg.replace(/"/, '"'), multi_arg.replace(/"/, '"'), 2)
msg3 = ungettext("Babel", multi_arg.replace(/"/, '"'), 2)
msg4 = ungettext(i18n_arg.replace(/"/, '"'), "Babels", 2)
msg5 = ungettext('bunny', 'bunnies', parseInt(Math.random() * 2 + 1))
msg6 = ungettext(arg0, 'bunnies', rparseInt(Math.random() * 2 + 1))
msg7 = _(hello.there)
msg8 = gettext('Rabbit')
msg9 = dgettext('wiki', model.addPage())
msg10 = dngettext(domain, 'Page', 'Pages', 3)
""")
        found = list(
            extract.extract('javascript', source,
                            extract.DEFAULT_KEYWORDS, [], {}))
        # Only calls whose message arguments are plain string literals
        # are expected to be picked up.
        self.assertEqual([(5, (u'bunny', u'bunnies'), [], None),
                          (8, u'Rabbit', [], None),
                          (10, (u'Page', u'Pages'), [], None)], found)

    def test_message_with_line_comment(self):
        source = BytesIO(u"""\
// NOTE: hello
msg = _('Bonjour à tous')
""".encode('utf-8'))
        found = list(extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        self.assertEqual(u'Bonjour à tous', found[0][2])
        self.assertEqual([u'NOTE: hello'], found[0][3])

    def test_message_with_multiline_comment(self):
        source = BytesIO(u"""\
/* NOTE: hello
   and bonjour
     and servus */
msg = _('Bonjour à tous')
""".encode('utf-8'))
        found = list(extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        self.assertEqual(u'Bonjour à tous', found[0][2])
        self.assertEqual([u'NOTE: hello', 'and bonjour', ' and servus'],
                         found[0][3])

    def test_ignore_function_definitions(self):
        source = BytesIO(b"""\
function gettext(value) {
    return translations[language][value] || value;
}""")

        found = list(extract.extract_javascript(source, ('gettext',), [], {}))
        self.assertEqual(found, [])

    def test_misplaced_comments(self):
        source = BytesIO(b"""\
/* NOTE: this won't show up */
foo()

/* NOTE: this will */
msg = _('Something')

// NOTE: this will show up
// too.
msg = _('Something else')

// NOTE: but this won't
bar()

_('no comment here')
""")
        found = list(extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        # Translator comments are attached only when immediately adjacent
        # to the extracted call.
        self.assertEqual(u'Something', found[0][2])
        self.assertEqual([u'NOTE: this will'], found[0][3])
        self.assertEqual(u'Something else', found[1][2])
        self.assertEqual([u'NOTE: this will show up', 'too.'], found[1][3])
        self.assertEqual(u'no comment here', found[2][2])
        self.assertEqual([], found[2][3])


class ExtractTestCase(unittest.TestCase):

def test_invalid_filter(self):
Expand Down
Loading

0 comments on commit 124294a

Please sign in to comment.