Skip to content

Commit

Permalink
Merge pull request #13 from termi/es6-unicode-surrogates
Browse files Browse the repository at this point in the history
Unicode surrogate pair support | ClassRange.from fix
  • Loading branch information
jviereck committed Apr 6, 2014
2 parents 942b3af + 2022d02 commit 0ff535f
Show file tree
Hide file tree
Showing 5 changed files with 1,299 additions and 523 deletions.
55 changes: 51 additions & 4 deletions parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@
// CharacterEscape
// CharacterClassEscape

function parse(str) {
function parse(str, flags) {
var hasUnicodeFlag = (flags || "").indexOf("u") !== -1;
var pos = 0;
var lastMatchIdx = 0;
var lastMatchClosed = 0;
Expand All @@ -132,6 +133,24 @@ function parse(str) {
}

function createCharacter(matches) {
if (hasUnicodeFlag){
var _char = matches[0];
var first = _char.charCodeAt(0), second;
if (_char.length === 1 && first >= 0xD800 && first <= 0xDBFF ) {
second = lookahead().charCodeAt(0);
if (second >= 0xDC00 && second <= 0xDFFF) {
// Unicode surrogate pair
pos++;
return addRaw({
type: 'escape',
name: 'codePoint',
value: (((first - 0xD800) * 0x400 + second - 0xDC00 + 0x10000)).toString(16).toUpperCase(),
from: pos - 2,
to: pos
});
}
}
}
return addRaw({
type: 'character',
char: matches[0],
Expand Down Expand Up @@ -531,6 +550,32 @@ function parse(str) {
return parseGroup('(?:', 'ignore', '(', 'normal');
}
}

/**
 * Fuses a `\uXXXX` lead-surrogate escape with an immediately following
 * `\uXXXX` trail-surrogate escape into a single code point escape node.
 *
 * Only active when the pattern is parsed with the `u` flag. When no pair is
 * found the parser position is left where it was and `firstEscape` is
 * returned unchanged.
 *
 * @param {Object} firstEscape - Escape node that was just parsed; on a
 *   successful merge it is mutated in place (`to`, `value`, `name`, and the
 *   raw text via `addRaw`).
 * @returns {Object} The same `firstEscape` object, merged or untouched.
 */
function parseUnicodeSurrogatePairEscape(firstEscape) {
  if (!hasUnicodeFlag) {
    return firstEscape;
  }

  // Only the four-digit `\uXXXX` form can be half of a surrogate pair. A
  // `\u{...}` escape (name 'codePoint') or any other escape whose hex value
  // happens to fall in the surrogate range must not be merged.
  // NOTE(review): relies on createEscaped('unicode', ...) tagging the node
  // with name === 'unicode' - confirm against createEscaped.
  if (firstEscape.type !== 'escape' || firstEscape.name !== 'unicode') {
    return firstEscape;
  }

  var first = parseInt(firstEscape.value, 16);
  if (!(first >= 0xD800 && first <= 0xDBFF)) {
    return firstEscape;
  }

  if (!(current('\\') && next('u'))) {
    return firstEscape;
  }

  // Speculatively consume the next escape; remember where we were so the
  // parser can be rewound if it turns out not to be a trail surrogate.
  var prevPos = pos;
  pos++; // step over the backslash, as parseClassEscape starts after it
  var secondEscape = parseClassEscape();

  var isUnicodeEscape = Boolean(secondEscape) &&
    secondEscape.type === 'escape' &&
    secondEscape.name === 'unicode';
  var second = isUnicodeEscape ? parseInt(secondEscape.value, 16) : NaN;

  if (second >= 0xDC00 && second <= 0xDFFF) {
    // Unicode surrogate pair: widen the first node to span both escapes and
    // store the combined code point as upper-case hex.
    firstEscape.to = secondEscape.to;
    firstEscape.value = ((first - 0xD800) * 0x400 + second - 0xDC00 + 0x10000).toString(16).toUpperCase();
    firstEscape.name = 'codePoint';
    addRaw(firstEscape); // refresh the raw text for the widened range
  } else {
    pos = prevPos;
  }

  return firstEscape;
}

function parseClassEscape() {
return parseAtomEscape(true);
Expand Down Expand Up @@ -654,7 +699,7 @@ function parse(str) {
return createEscaped('hex', res[1], 1);
} else if (res = matchReg(/^u([0-9a-fA-F]{4})/)) {
// UnicodeEscapeSequence
return createEscaped('unicode', res[1], 1);
return parseUnicodeSurrogatePairEscape(createEscaped('unicode', res[1], 1));
} else if (res = matchReg(/^u\{([0-9a-fA-F]{1,6})\}/)) {
// RegExpUnicodeEscapeSequence (ES6 Unicode code point escape)
return createEscaped('codePoint', res[1], 3);
Expand Down Expand Up @@ -740,7 +785,7 @@ function parse(str) {
}

function parseHelperClassRanges(atom) {
var from = pos, to, res;
var from, to, res;
if (current('-') && !next(']')) {
// ClassAtom - ClassAtom ClassRanges
skip('-');
Expand All @@ -754,6 +799,7 @@ function parse(str) {
if (!classRanges) {
throw syntaxError('classRanges');
}
from = atom.from;
if (classRanges.type === 'empty') {
return [createClassRange(atom, res, from, to)];
}
Expand Down Expand Up @@ -833,7 +879,8 @@ function parse(str) {
if (!res) {
throw syntaxError('classEscape');
}
return res;

return parseUnicodeSurrogatePairEscape(res);
}
}

Expand Down
17 changes: 12 additions & 5 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,24 @@ var fs = require('fs');

var parse = require('../parser').parse;

var parseTests = JSON.parse(fs.readFileSync('test/parse_input.json') || '[]');
var parseResult = JSON.parse(fs.readFileSync('test/parse_output.json') || '[]');
// Build the list of test cases: every pattern from parse_input.json is run
// with no flags, every pattern from parse_unicode_input.json with the 'u' flag.
var parseTests = [].concat(
JSON.parse(fs.readFileSync('test/parse_input.json') || '[]').map(function(re){ return {input: re, flags: ''} }),
JSON.parse(fs.readFileSync('test/parse_unicode_input.json', 'utf8') || '[]').map(function(re){ return {input: re, flags: 'u'} })
);
// Expected results, concatenated in the same order as the inputs above
// (non-unicode fixtures first, then unicode fixtures).
var parseResult = [].concat(
JSON.parse(fs.readFileSync('test/parse_output.json') || '[]'),
JSON.parse(fs.readFileSync('test/parse_unicode_output.json', 'utf8') || '[]')
);

// Inputs and expected outputs are paired by index, so the two lists must
// have the same length.
// NOTE(review): `fail` is not defined in the visible part of this file - confirm
// it exists.
if (parseTests.length !== parseResult.length) {
fail('Parse input and output file needs to have same number of arguments');
}

parseTests.forEach(function(input, idx) {
parseTests.forEach(function(re, idx) {
var input = re.input, flags = re.flags;
var par;
try {
par = parse(input);
par = parse(input, flags);
} catch (error) {
par = {
type: 'error',
Expand All @@ -25,7 +32,7 @@ parseTests.forEach(function(input, idx) {
var resuls = parseResult[idx];

if (JSON.stringify(par) !== JSON.stringify(resuls)) {
throw new Error('Failure parsing string ' + input + ':' + JSON.stringify(par) + '\n' + JSON.stringify(resuls));
throw new Error('Failure parsing string ' + input + (flags ? '(' + flags + ')' : '') + ':' + JSON.stringify(par) + '\n' + JSON.stringify(resuls));
} else {
console.log('PASSED TEST: ' + input);
}
Expand Down
Loading

0 comments on commit 0ff535f

Please sign in to comment.