From d3f3b720849023e5c00ed5ddf559a710d268a9f2 Mon Sep 17 00:00:00 2001 From: mikesamuel Date: Fri, 4 Jul 2008 21:55:31 +0000 Subject: [PATCH] Separate public API from private API in preparation for creating a language handler registry --- src/prettify.js | 1809 ++++++++++++++++++++++++----------------------- 1 file changed, 925 insertions(+), 884 deletions(-) diff --git a/src/prettify.js b/src/prettify.js index 80a58c14..99aa9cc8 100644 --- a/src/prettify.js +++ b/src/prettify.js @@ -43,76 +43,34 @@ * Java annotations (start with "@") are now captured as literals ("lit") */ -var PR_keywords = {}; -/** initialize the keyword list for our target languages. */ -(function () { - var CPP_KEYWORDS = "abstract bool break case catch char class const " + - "const_cast continue default delete deprecated dllexport dllimport do " + - "double dynamic_cast else enum explicit extern false float for friend " + - "goto if inline int long mutable naked namespace new noinline noreturn " + - "nothrow novtable operator private property protected public register " + - "reinterpret_cast return selectany short signed sizeof static " + - "static_cast struct switch template this thread throw true try typedef " + - "typeid typename union unsigned using declaration, directive uuid " + - "virtual void volatile while typeof"; - var CSHARP_KEYWORDS = "as base by byte checked decimal delegate descending " + - "event finally fixed foreach from group implicit in interface internal " + - "into is lock null object out override orderby params readonly ref sbyte " + - "sealed stackalloc string select uint ulong unchecked unsafe ushort var"; - var JAVA_KEYWORDS = "package synchronized boolean implements import throws " + - "instanceof transient extends final strictfp native super"; - var JSCRIPT_KEYWORDS = "debugger export function with NaN Infinity"; - var PERL_KEYWORDS = "require sub unless until use elsif BEGIN END"; - var PYTHON_KEYWORDS = "and assert def del elif except exec global lambda " + - "not or pass print raise yield False True None"; - var RUBY_KEYWORDS = "then end begin rescue ensure module when undef next " + - "redo retry alias defined"; - var SH_KEYWORDS = "done fi"; - - var KEYWORDS = [CPP_KEYWORDS, CSHARP_KEYWORDS, JAVA_KEYWORDS, - JSCRIPT_KEYWORDS, PERL_KEYWORDS, PYTHON_KEYWORDS, - RUBY_KEYWORDS, SH_KEYWORDS]; - for (var k = 0; k < KEYWORDS.length; k++) { - var kw = KEYWORDS[k].split(' '); - for (var i = 0; i < kw.length; i++) { - if (kw[i]) { PR_keywords[kw[i]] = true; } - } - } -}).call(this); - -// token style names. correspond to css classes -/** token style for a string literal */ -var PR_STRING = 'str'; -/** token style for a keyword */ -var PR_KEYWORD = 'kwd'; -/** token style for a comment */ -var PR_COMMENT = 'com'; -/** token style for a type */ -var PR_TYPE = 'typ'; -/** token style for a literal value. e.g. 1, null, true. */ -var PR_LITERAL = 'lit'; -/** token style for a punctuation string. */ -var PR_PUNCTUATION = 'pun'; -/** token style for a punctuation string. */ -var PR_PLAIN = 'pln'; - -/** token style for an sgml tag. */ -var PR_TAG = 'tag'; -/** token style for a markup declaration such as a DOCTYPE. */ -var PR_DECLARATION = 'dec'; -/** token style for embedded source. */ -var PR_SOURCE = 'src'; -/** token style for an sgml attribute name. */ -var PR_ATTRIB_NAME = 'atn'; -/** token style for an sgml attribute value. */ -var PR_ATTRIB_VALUE = 'atv'; +/** + * Split {@code prettyPrint} into multiple timeouts so as not to interfere with + * UI events. + * If set to {@code false}, {@code prettyPrint()} is synchronous. + */ +var PR_SHOULD_USE_CONTINUATION = true; /** the number of characters between tab columns */ var PR_TAB_WIDTH = 8; -function PR_isWordChar(ch) { - return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); -} +/** Walks the DOM returning a properly escaped version of innerHTML. + * @param {Node} node + * @param {Array.} out output buffer that receives chunks of HTML. + */ +var PR_normalizedHtml; + +/** Pretty print a chunk of code. + * + * @param {string} sourceCodeHtml code as html + * @return {string} code as html, but prettier + */ +var prettyPrintOne; +/** find all the < pre > and < code > tags in the DOM with class=prettyprint + * and prettify them. + * @param {Function} opt_whenDone if specified, called when the last entry + * has been finished. + */ +var prettyPrint; function PR_isIE6() { var isIE6 = navigator && navigator.userAgent @@ -121,910 +79,993 @@ function PR_isIE6() { return isIE6; } -/** Splice one array into another. - * Like the python - * container[containerPosition:containerPosition + countReplaced] = inserted - * - * @param {Array} inserted - * @param {Array} container modified in place - * @param {Number} containerPosition - * @param {Number} countReplaced - */ -function PR_spliceArrayInto( - inserted, container, containerPosition, countReplaced) { - inserted.unshift(containerPosition, countReplaced || 0); - try { - container.splice.apply(container, inserted); - } finally { - inserted.splice(0, 2); - } -} -/** a set of tokens that can precede a regular expression literal in javascript. - * http://www.mozilla.org/js/language/js20/rationale/syntax.html has the full - * list, but I've removed ones that might be problematic when seen in languages - * that don't support regular expression literals. - * - *

Specifically, I've removed any keywords that can't precede a regexp - * literal in a syntactically legal javascript program, and I've removed the - * "in" keyword since it's not a keyword in many languages, and might be used - * as a count of inches. - * @private - */ -var REGEXP_PRECEDER_PATTERN = (function () { - var preceders = [ - "!", "!=", "!==", "#", "%", "%=", "&", "&&", "&&=", - "&=", "(", "*", "*=", /* "+", */ "+=", ",", /* "-", */ "-=", - "->", /*".", "..", "...", handled below */ "/", "/=", ":", "::", ";", - "<", "<<", "<<=", "<=", "=", "==", "===", ">", - ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", - "^", "^=", "^^", "^^=", "{", "|", "|=", "||", - "||=", "~", "break", "case", "continue", "delete", - "do", "else", "finally", "instanceof", - "return", "throw", "try", "typeof" - ]; - var pattern = '(?:' + - '(?:(?:^|[^0-9\.])\\.{1,3})|' + // a dot that's not part of a number - '(?:(?:^|[^\\+])\\+)|' + // allow + but not ++ - '(?:(?:^|[^\\-])-)' // allow - but not -- - ; - for (var i = 0; i < preceders.length; ++i) { - var preceder = preceders[i]; - if (PR_isWordChar(preceder.charAt(0))) { - pattern += '|\\b' + preceder; - } else { - pattern += '|' + preceder.replace(/([^=<>:&])/g, '\\$1'); +(function () { + var PR_keywords = {}; + /** initialize the keyword list for our target languages. */ + (function () { + var CPP_KEYWORDS = "abstract bool break case catch char class const " + + "const_cast continue default delete deprecated dllexport dllimport do " + + "double dynamic_cast else enum explicit extern false float for friend " + + "goto if inline int long mutable naked namespace new noinline noreturn " + + "nothrow novtable operator private property protected public register " + + "reinterpret_cast return selectany short signed sizeof static " + + "static_cast struct switch template this thread throw true try typedef " + + "typeid typename union unsigned using declaration directive uuid " + + "virtual void volatile while typeof"; + var CSHARP_KEYWORDS = "as base by byte checked decimal delegate " + + "descending event finally fixed foreach from group implicit in " + + "interface internal into is lock null object out override orderby " + + "params readonly ref sbyte sealed stackalloc string select uint ulong " + + "unchecked unsafe ushort var"; + var JAVA_KEYWORDS = "package synchronized boolean implements import " + + "throws instanceof transient extends final strictfp native super"; + var JSCRIPT_KEYWORDS = "debugger export function with NaN Infinity"; + var PERL_KEYWORDS = "require sub unless until use elsif BEGIN END"; + var PYTHON_KEYWORDS = "and assert def del elif except exec global lambda " + + "not or pass print raise yield False True None"; + var RUBY_KEYWORDS = "then end begin rescue ensure module when undef next " + + "redo retry alias defined"; + var SH_KEYWORDS = "done esac fi"; + + var KEYWORDS = [CPP_KEYWORDS, CSHARP_KEYWORDS, JAVA_KEYWORDS, + JSCRIPT_KEYWORDS, PERL_KEYWORDS, PYTHON_KEYWORDS, + RUBY_KEYWORDS, SH_KEYWORDS]; + for (var k = 0; k < KEYWORDS.length; k++) { + var kw = KEYWORDS[k].split(' '); + for (var i = 0; i < kw.length; i++) { + if (kw[i]) { PR_keywords[kw[i]] = true; } } } - pattern += '|^)\\s*$'; // matches at end, and matches empty string - return new RegExp(pattern); - // CAVEAT: this does not properly handle the case where a regular expression - // immediately follows another since a regular expression may have flags - // for case-sensitivity and the like. Having regexp tokens adjacent is not - // valid in any language I'm aware of, so I'm punting. - // TODO: maybe style special characters inside a regexp as punctuation. - })(); - -// Define regexps here so that the interpreter doesn't have to create an object -// each time the function containing them is called. -// The language spec requires a new object created even if you don't access the -// $1 members. -var pr_amp = /&/g; -var pr_lt = //g; -var pr_quot = /\"/g; -/** like textToHtml but escapes double quotes to be attribute safe. */ -function PR_attribToHtml(str) { - return str.replace(pr_amp, '&') - .replace(pr_lt, '<') - .replace(pr_gt, '>') - .replace(pr_quot, '"'); -} - -/** escapest html special characters to html. */ -function PR_textToHtml(str) { - return str.replace(pr_amp, '&') - .replace(pr_lt, '<') - .replace(pr_gt, '>'); -} + }).call(this); + + // token style names. correspond to css classes + /** token style for a string literal */ + var PR_STRING = 'str'; + /** token style for a keyword */ + var PR_KEYWORD = 'kwd'; + /** token style for a comment */ + var PR_COMMENT = 'com'; + /** token style for a type */ + var PR_TYPE = 'typ'; + /** token style for a literal value. e.g. 1, null, true. */ + var PR_LITERAL = 'lit'; + /** token style for a punctuation string. */ + var PR_PUNCTUATION = 'pun'; + /** token style for a punctuation string. */ + var PR_PLAIN = 'pln'; + + /** token style for an sgml tag. */ + var PR_TAG = 'tag'; + /** token style for a markup declaration such as a DOCTYPE. */ + var PR_DECLARATION = 'dec'; + /** token style for embedded source. */ + var PR_SOURCE = 'src'; + /** token style for an sgml attribute name. */ + var PR_ATTRIB_NAME = 'atn'; + /** token style for an sgml attribute value. */ + var PR_ATTRIB_VALUE = 'atv'; + + function PR_isWordChar(ch) { + return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); + } + /** Splice one array into another. + * Like the python + * container[containerPosition:containerPosition + countReplaced] = inserted + * + * @param {Array} inserted + * @param {Array} container modified in place + * @param {Number} containerPosition + * @param {Number} countReplaced + */ + function PR_spliceArrayInto( + inserted, container, containerPosition, countReplaced) { + inserted.unshift(containerPosition, countReplaced || 0); + try { + container.splice.apply(container, inserted); + } finally { + inserted.splice(0, 2); + } + } -var pr_ltEnt = /</g; -var pr_gtEnt = />/g; -var pr_aposEnt = /'/g; -var pr_quotEnt = /"/g; -var pr_ampEnt = /&/g; -/** unescapes html to plain text. */ -function PR_htmlToText(html) { - var pos = html.indexOf('&'); - if (pos < 0) { return html; } - // Handle numeric entities specially. We can't use functional substitution - // since that doesn't work in older versions of Safari. - // These should be rare since most browsers convert them to normal chars. - for (--pos; (pos = html.indexOf('&#', pos + 1)) >= 0;) { - var end = html.indexOf(';', pos); - if (end >= 0) { - var num = html.substring(pos + 3, end); - var radix = 10; - if (num && num.charAt(0) == 'x') { - num = num.substring(1); - radix = 16; + /** A set of tokens that can precede a regular expression literal in + * javascript. + * http://www.mozilla.org/js/language/js20/rationale/syntax.html has the full + * list, but I've removed ones that might be problematic when seen in + * languages that don't support regular expression literals. + * + *

Specifically, I've removed any keywords that can't precede a regexp + * literal in a syntactically legal javascript program, and I've removed the + * "in" keyword since it's not a keyword in many languages, and might be used + * as a count of inches. + * @private + */ + var REGEXP_PRECEDER_PATTERN = (function () { + var preceders = [ + "!", "!=", "!==", "#", "%", "%=", "&", "&&", "&&=", + "&=", "(", "*", "*=", /* "+", */ "+=", ",", /* "-", */ "-=", + "->", /*".", "..", "...", handled below */ "/", "/=", ":", "::", ";", + "<", "<<", "<<=", "<=", "=", "==", "===", ">", + ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", + "^", "^=", "^^", "^^=", "{", "|", "|=", "||", + "||=", "~", "break", "case", "continue", "delete", + "do", "else", "finally", "instanceof", + "return", "throw", "try", "typeof" + ]; + var pattern = '(?:' + + '(?:(?:^|[^0-9\.])\\.{1,3})|' + // a dot that's not part of a number + '(?:(?:^|[^\\+])\\+)|' + // allow + but not ++ + '(?:(?:^|[^\\-])-)' // allow - but not -- + ; + for (var i = 0; i < preceders.length; ++i) { + var preceder = preceders[i]; + if (PR_isWordChar(preceder.charAt(0))) { + pattern += '|\\b' + preceder; + } else { + pattern += '|' + preceder.replace(/([^=<>:&])/g, '\\$1'); + } } - var codePoint = parseInt(num, radix); - if (!isNaN(codePoint)) { - html = (html.substring(0, pos) + String.fromCharCode(codePoint) + - html.substring(end + 1)); + pattern += '|^)\\s*$'; // matches at end, and matches empty string + return new RegExp(pattern); + // CAVEAT: this does not properly handle the case where a regular + // expression immediately follows another since a regular expression may + // have flags for case-sensitivity and the like. Having regexp tokens + // adjacent is not + // valid in any language I'm aware of, so I'm punting. + // TODO: maybe style special characters inside a regexp as punctuation. + })(); + + // Define regexps here so that the interpreter doesn't have to create an + // object each time the function containing them is called. + // The language spec requires a new object created even if you don't access + // the $1 members. + var pr_amp = /&/g; + var pr_lt = //g; + var pr_quot = /\"/g; + /** like textToHtml but escapes double quotes to be attribute safe. */ + function PR_attribToHtml(str) { + return str.replace(pr_amp, '&') + .replace(pr_lt, '<') + .replace(pr_gt, '>') + .replace(pr_quot, '"'); + } + + /** escapest html special characters to html. */ + function PR_textToHtml(str) { + return str.replace(pr_amp, '&') + .replace(pr_lt, '<') + .replace(pr_gt, '>'); + } + + + var pr_ltEnt = /</g; + var pr_gtEnt = />/g; + var pr_aposEnt = /'/g; + var pr_quotEnt = /"/g; + var pr_ampEnt = /&/g; + /** unescapes html to plain text. */ + function PR_htmlToText(html) { + var pos = html.indexOf('&'); + if (pos < 0) { return html; } + // Handle numeric entities specially. We can't use functional substitution + // since that doesn't work in older versions of Safari. + // These should be rare since most browsers convert them to normal chars. + for (--pos; (pos = html.indexOf('&#', pos + 1)) >= 0;) { + var end = html.indexOf(';', pos); + if (end >= 0) { + var num = html.substring(pos + 3, end); + var radix = 10; + if (num && num.charAt(0) == 'x') { + num = num.substring(1); + radix = 16; + } + var codePoint = parseInt(num, radix); + if (!isNaN(codePoint)) { + html = (html.substring(0, pos) + String.fromCharCode(codePoint) + + html.substring(end + 1)); + } } } + + return html.replace(pr_ltEnt, '<') + .replace(pr_gtEnt, '>') + .replace(pr_aposEnt, "'") + .replace(pr_quotEnt, '"') + .replace(pr_ampEnt, '&'); } - return html.replace(pr_ltEnt, '<') - .replace(pr_gtEnt, '>') - .replace(pr_aposEnt, "'") - .replace(pr_quotEnt, '"') - .replace(pr_ampEnt, '&'); -} + /** is the given node's innerHTML normally unescaped? */ + function PR_isRawContent(node) { + return 'XMP' == node.tagName; + } -/** is the given node's innerHTML normally unescaped? */ -function PR_isRawContent(node) { - return 'XMP' == node.tagName; -} + var PR_innerHtmlWorks = null; + function PR_getInnerHtml(node) { + // inner html is hopelessly broken in Safari 2.0.4 when the content is + // an html description of well formed XML and the containing tag is a PRE + // tag, so we detect that case and emulate innerHTML. + if (null === PR_innerHtmlWorks) { + var testNode = document.createElement('PRE'); + testNode.appendChild( + document.createTextNode('\n')); + PR_innerHtmlWorks = !/\n')); - PR_innerHtmlWorks = !/'); + for (var child = node.firstChild; child; child = child.nextSibling) { + PR_normalizedHtml(child, out); + } + if (node.firstChild || !/^(?:br|link|img)$/.test(name)) { + out.push('<\/', name, '>'); + } + break; + case 2: // an attribute + out.push(node.name.toLowerCase(), + '="', PR_attribToHtml(node.value), '"'); + break; + case 3: case 4: // text + out.push(PR_textToHtml(node.nodeValue)); + break; + } } - return out.join(''); -} -/** walks the DOM returning a properly escaped version of innerHTML. - */ -function PR_normalizedHtml(node, out) { - switch (node.nodeType) { - case 1: // an element - var name = node.tagName.toLowerCase(); - out.push('\074', name); - for (var i = 0; i < node.attributes.length; ++i) { - var attr = node.attributes[i]; - if (!attr.specified) { continue; } - out.push(' '); - PR_normalizedHtml(attr, out); - } - out.push('>'); - for (var child = node.firstChild; child; child = child.nextSibling) { - PR_normalizedHtml(child, out); - } - if (node.firstChild || !/^(?:br|link|img)$/.test(name)) { - out.push('<\/', name, '>'); + /** returns a function that expand tabs to spaces. This function can be fed + * successive chunks of text, and will maintain its own internal state to + * keep track of how tabs are expanded. + * @return {function (plainText : string) : string} a function that takes + * plain text and return the text with tabs expanded. + * @private + */ + function PR_tabExpander(tabWidth) { + var SPACES = ' '; + var charInLine = 0; + + return function (plainText) { + // walk over each character looking for tabs and newlines. + // On tabs, expand them. On newlines, reset charInLine. + // Otherwise increment charInLine + var out = null; + var pos = 0; + for (var i = 0, n = plainText.length; i < n; ++i) { + var ch = plainText.charAt(i); + + switch (ch) { + case '\t': + if (!out) { out = []; } + out.push(plainText.substring(pos, i)); + // calculate how much space we need in front of this part + // nSpaces is the amount of padding -- the number of spaces needed + // to move us to the next column, where columns occur at factors of + // tabWidth. + var nSpaces = tabWidth - (charInLine % tabWidth); + charInLine += nSpaces; + for (; nSpaces >= 0; nSpaces -= SPACES.length) { + out.push(SPACES.substring(0, nSpaces)); + } + pos = i + 1; + break; + case '\n': + charInLine = 0; + break; + default: + ++charInLine; + } } - break; - case 2: // an attribute - out.push(node.name.toLowerCase(), '="', PR_attribToHtml(node.value), '"'); - break; - case 3: case 4: // text - out.push(PR_textToHtml(node.nodeValue)); - break; + if (!out) { return plainText; } + out.push(plainText.substring(pos)); + return out.join(''); + }; } -} - -/** returns a function that expand tabs to spaces. This function can be fed - * successive chunks of text, and will maintain its own internal state to - * keep track of how tabs are expanded. - * @return {function (plainText : String) : String } a function that takes - * plain text and return the text with tabs expanded. - * @private - */ -function PR_tabExpander(tabWidth) { - var SPACES = ' '; - var charInLine = 0; - - return function (plainText) { - // walk over each character looking for tabs and newlines. - // On tabs, expand them. On newlines, reset charInLine. - // Otherwise increment charInLine - var out = null; - var pos = 0; - for (var i = 0, n = plainText.length; i < n; ++i) { - var ch = plainText.charAt(i); - - switch (ch) { - case '\t': - if (!out) { out = []; } - out.push(plainText.substring(pos, i)); - // calculate how much space we need in front of this part - // nSpaces is the amount of padding -- the number of spaces needed to - // move us to the next column, where columns occur at factors of - // tabWidth. - var nSpaces = tabWidth - (charInLine % tabWidth); - charInLine += nSpaces; - for (; nSpaces >= 0; nSpaces -= SPACES.length) { - out.push(SPACES.substring(0, nSpaces)); - } - pos = i + 1; - break; - case '\n': - charInLine = 0; - break; - default: - ++charInLine; - } - } - if (!out) { return plainText; } - out.push(plainText.substring(pos)); - return out.join(''); - }; -} -// The below pattern matches one of the following -// (1) /[^<]+/ : A run of characters other than '<' -// (2) //: an HTML comment -// (3) //: a cdata section -// (3) /<\/?[a-zA-Z][^>]*>/ : A probably tag that should not be highlighted -// (4) //: an HTML comment + // (3) //: a cdata section + // (3) /<\/?[a-zA-Z][^>]*>/ : A probably tag that should not be highlighted + // (4) /||<\/?[a-zA-Z][^>]*>|<)/g; -var pr_commentPrefix = /^|$)/, null], - [PR_SOURCE, /^<\?[\s\S]*?(?:\?>|$)/, null], - [PR_SOURCE, /^<%[\s\S]*?(?:%>|$)/, null], - [PR_SOURCE, - // Tags whose content is not escaped, and which contain source code. - /^<(script|style|xmp)\b[^>]*>[\s\S]*?<\/\1\b[^>]*>/i, null], - [PR_TAG, /^<\/?\w[^<>]*>/, null] - ]); -// Splits any of the source|style|xmp entries above into a start tag, -// source content, and end tag. -var PR_SOURCE_CHUNK_PARTS = /^(<[^>]*>)([\s\S]*)(<\/[^>]*>)$/; -/** split markup on tags, comments, application directives, and other top level - * constructs. Tags are returned as a single token - attributes are not yet - * broken out. - * @private - */ -function PR_tokenizeMarkup(source) { - var decorations = PR_MARKUP_LEXER(source); - for (var i = 0; i < decorations.length; i += 2) { - if (decorations[i + 1] === PR_SOURCE) { - var start = decorations[i]; - var end = i + 2 < decorations.length ? decorations[i + 2] : source.length; - // Split out start and end script tags as actual tags, and leave the body - // with style SCRIPT. - var sourceChunk = source.substring(start, end); - var match = (sourceChunk.match(PR_SOURCE_CHUNK_PARTS) - //|| sourceChunk.match(/^(<[?%])([\s\S]*)([?%]>)$/) - ); - if (match) { - decorations.splice( - i, 2, - start, PR_TAG, // the open chunk - start + match[1].length, PR_SOURCE, - start + match[1].length + (match[2] || '').length, PR_TAG); + var PR_MARKUP_LEXER = PR_createSimpleLexer([], [ + [PR_PLAIN, /^[^<]+/, null], + [PR_DECLARATION, /^]*(?:>|$)/, null], + [PR_COMMENT, /^|$)/, null], + [PR_SOURCE, /^<\?[\s\S]*?(?:\?>|$)/, null], + [PR_SOURCE, /^<%[\s\S]*?(?:%>|$)/, null], + [PR_SOURCE, + // Tags whose content is not escaped, and which contain source code. + /^<(script|style|xmp)\b[^>]*>[\s\S]*?<\/\1\b[^>]*>/i, null], + [PR_TAG, /^<\/?\w[^<>]*>/, null] + ]); + // Splits any of the source|style|xmp entries above into a start tag, + // source content, and end tag. + var PR_SOURCE_CHUNK_PARTS = /^(<[^>]*>)([\s\S]*)(<\/[^>]*>)$/; + /** split markup on tags, comments, application directives, and other top + * level constructs. Tags are returned as a single token - attributes are + * not yet broken out. + * @private + */ + function PR_tokenizeMarkup(source) { + var decorations = PR_MARKUP_LEXER(source); + for (var i = 0; i < decorations.length; i += 2) { + if (decorations[i + 1] === PR_SOURCE) { + var start = decorations[i]; + var end = i + 2 < decorations.length + ? decorations[i + 2] + : source.length; + // Split out start and end script tags as actual tags, and leave the + // body with style SCRIPT. + var sourceChunk = source.substring(start, end); + var match = (sourceChunk.match(PR_SOURCE_CHUNK_PARTS) + //|| sourceChunk.match(/^(<[?%])([\s\S]*)([?%]>)$/) + ); + if (match) { + decorations.splice( + i, 2, + start, PR_TAG, // the open chunk + start + match[1].length, PR_SOURCE, + start + match[1].length + (match[2] || '').length, PR_TAG); + } } } + return decorations; } - return decorations; -} -var PR_TAG_LEXER = PR_createSimpleLexer([ - [PR_ATTRIB_VALUE, /^\'[^\']*(?:\'|$)/, null, "'"], - [PR_ATTRIB_VALUE, /^\"[^\"]*(?:\"|$)/, null, '"'], - [PR_PUNCTUATION, /^[<>\/=]+/, null, '<>/='] - ], [ - [PR_TAG, /^[\w:-]+/, /^\/=]+/, null, '<>/='] + ], [ + [PR_TAG, /^[\w:-]+/, /^= 2 && /^[\"\']/.test(attribValue) && - attribValue.charAt(0) === attribValue.charAt(attribLen - 1)); - - var attribSource; - var attribSourceStart; - var attribSourceEnd; - if (quoted) { - attribSourceStart = start + 1; - attribSourceEnd = end - 1; - attribSource = attribValue; - } else { - attribSourceStart = start + 1; - attribSourceEnd = end - 1; - attribSource = attribValue.substring(1, attribValue.length - 1); - } + var end = i + 2 < decorations.length + ? decorations[i + 2] + : source.length; + nextValueIsSource = /^on|^style$/i.test(source.substring(start, end)); + } else if (style == PR_ATTRIB_VALUE) { + if (nextValueIsSource) { + var start = decorations[i]; + var end + = i + 2 < decorations.length ? decorations[i + 2] : source.length; + var attribValue = source.substring(start, end); + var attribLen = attribValue.length; + var quoted = + (attribLen >= 2 && /^[\"\']/.test(attribValue) && + attribValue.charAt(0) === attribValue.charAt(attribLen - 1)); + + var attribSource; + var attribSourceStart; + var attribSourceEnd; + if (quoted) { + attribSourceStart = start + 1; + attribSourceEnd = end - 1; + attribSource = attribValue; + } else { + attribSourceStart = start + 1; + attribSourceEnd = end - 1; + attribSource = attribValue.substring(1, attribValue.length - 1); + } - var attribSourceDecorations = PR_decorateSource(attribSource); - for (var j = 0, m = attribSourceDecorations.length; j < m; j += 2) { - attribSourceDecorations[j] += attribSourceStart; - } + var attribSourceDecorations = PR_decorateSource(attribSource); + for (var j = 0, m = attribSourceDecorations.length; j < m; j += 2) { + attribSourceDecorations[j] += attribSourceStart; + } - if (quoted) { - attribSourceDecorations.push(attribSourceEnd, PR_ATTRIB_VALUE); - PR_spliceArrayInto(attribSourceDecorations, decorations, i + 2, 0); - } else { - PR_spliceArrayInto(attribSourceDecorations, decorations, i, 2); + if (quoted) { + attribSourceDecorations.push(attribSourceEnd, PR_ATTRIB_VALUE); + PR_spliceArrayInto(attribSourceDecorations, decorations, i + 2, 0); + } else { + PR_spliceArrayInto(attribSourceDecorations, decorations, i, 2); + } } + nextValueIsSource = false; } - nextValueIsSource = false; } + return decorations; } - return decorations; -} -/** returns a list of decorations, where even entries - * - * This code treats ", ', and ` as string delimiters, and \ as a string escape. - * It does not recognize perl's qq() style strings. It has no special handling - * for double delimiter escapes as in basic, or tje tripled delimiters used in - * python, but should work on those regardless although in those cases a single - * string literal may be broken up into multiple adjacent string literals. - * - * It recognizes C, C++, and shell style comments. - * - * @param {String} sourceCode as plain text - * @return {Array.} a decoration list - */ -function PR_decorateSource(sourceCode) { - // Split into strings, comments, and other. - // We do this because strings and comments are easily recognizable and can - // contain stuff that looks like other tokens, so we want to mark those early - // so we don't recurse into them. - var decorations = PR_splitStringAndCommentTokens(sourceCode); + /** returns a list of decorations, where even entries + * + * This code treats ", ', and ` as string delimiters, and \ as a string + * escape. It does not recognize perl's qq() style strings. + * It has no special handling for double delimiter escapes as in basic, or + * the tripled delimiters used in python, but should work on those regardless + * although in those cases a single string literal may be broken up into + * multiple adjacent string literals. + * + * It recognizes C, C++, and shell style comments. + * + * @param {string} sourceCode as plain text + * @return {Array.} a decoration list + */ + function PR_decorateSource(sourceCode) { + // Split into strings, comments, and other. + // We do this because strings and comments are easily recognizable and can + // contain stuff that looks like other tokens, so we want to mark those + // early so we don't recurse into them. + var decorations = PR_splitStringAndCommentTokens(sourceCode); + + // Split non comment|string tokens on whitespace and word boundaries + decorations = PR_splitNonStringNonCommentTokens(sourceCode, decorations); - // Split non comment|string tokens on whitespace and word boundaries - decorations = PR_splitNonStringNonCommentTokens(sourceCode, decorations); - - return decorations; -} + return decorations; + } -/** returns a decoration list given a string of markup. - * - * This code recognizes a number of constructs. - * comment - * declaration - * <\w ... > tag - * tag - * embedded source - * <%...%> embedded source - * &[#\w]...; entity - * - * It does not recognizes %foo; doctype entities from . - * - * It will recurse into any