Skip to content
This repository has been archived by the owner on Apr 22, 2020. It is now read-only.

Commit

Permalink
implemented language specific formatters, fixed python docstrings, an…
Browse files Browse the repository at this point in the history
…d slashes inside regular expression charsets.
  • Loading branch information
mikesamuel committed Jul 5, 2008
1 parent f5c2c36 commit e351373
Show file tree
Hide file tree
Showing 4 changed files with 732 additions and 113 deletions.
4 changes: 4 additions & 0 deletions CHANGES.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ <h2>29 March 2007</h2>
>patch</a>
<li>Added a <a href="http://google-code-prettify.googlecode.com/files/prettify-small.zip">distribution</a> that has comments and
whitespace removed to reduce download size from 45.5kB to 12.8kB.
<li>Added <a href="http://code.google.com/p/google-code-prettify/issues/detail?id=17">language specific formatters</a> that are triggered by the presence
of a <code>lang-&lt;language-file-extension&gt;</code></li>
<li>Fixed <a href="http://code.google.com/p/google-code-prettify/issues/detail?id=29">bug</a>: python handling of <code>'''string'''</code>
<li>Fixed bug: <code>/</code> in regex <code>[charsets]</code> should not end regex
</ul>
</body>
</html>
15 changes: 12 additions & 3 deletions README.html
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,17 @@ <h3>Which languages does it work for?</h3>
CAML-like languages.</p>

<h3>How do I specify which language my code is in?</h3>
<p>There's no way to tell it which language because that would complicate the
interface. If it doesn't guess the language properly, that's a bug.</p>
<p>You don't need to specify the language since <code>prettyprint()</code>
will guess. You can specify a language by specifying the language extension
along with the <code>prettyprint</code> class like so:</p>
<code class="prettyprint lang-html">
&lt;pre class=&quot;prettyprint <b>lang-html</b>&quot;&gt;<br>
&nbsp; The lang-* class specifies the language file extensions.<br>
&nbsp; Supported file extensions include<br>
&nbsp; &nbsp; "c", "cc", "cpp", "cs", "cyc", "java", "bsh", "csh", "sh",<br>
&nbsp; &nbsp; "cv", "py", "perl", "pl", "pm", "rb", "js",<br>
&nbsp; &nbsp; "htm", "html", "xhtml", "xml", "xsl".<br>
&lt;/pre&gt;</code>

<h3>It doesn't work on <tt>&lt;obfuscated code sample&gt;</tt>?</h3>
<p>Yes. Prettifying obfuscated code is like putting lipstick on a pig
Expand All @@ -93,7 +102,7 @@ <h3>What's changed?</h3>
<div class="footer">
<!-- Created: Tue Oct 3 17:51:56 PDT 2006 -->
<!-- hhmts start -->
Last modified: Mon Oct 9 16:47:24 PDT 2006
Last modified: Fri Jul 4 20:49:30 PDT 2008
<!-- hhmts end -->
</div>
</body>
Expand Down
270 changes: 162 additions & 108 deletions src/prettify.js
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ function pr_isIE6() {
"BEGIN END ";
var SH_KEYWORDS = "break case continue do done elif else esac eval fi for " +
"function if in local set then until while ";
var ALL_KEYWORD_SET = wordSet(
var ALL_KEYWORDS = (
CPP_KEYWORDS + CSHARP_KEYWORDS + JSCRIPT_KEYWORDS + PERL_KEYWORDS +
PYTHON_KEYWORDS + RUBY_KEYWORDS + SH_KEYWORDS);

Expand Down Expand Up @@ -270,6 +270,7 @@ function pr_isIE6() {
var pr_aposEnt = /&apos;/g;
var pr_quotEnt = /&quot;/g;
var pr_ampEnt = /&amp;/g;
var pr_nbspEnt = /&nbsp;/g;
/** unescapes html to plain text. */
function htmlToText(html) {
var pos = html.indexOf('&');
Expand Down Expand Up @@ -298,7 +299,8 @@ function pr_isIE6() {
.replace(pr_gtEnt, '>')
.replace(pr_aposEnt, "'")
.replace(pr_quotEnt, '"')
.replace(pr_ampEnt, '&');
.replace(pr_ampEnt, '&')
.replace(pr_nbspEnt, ' ');
}

/** is the given node's innerHTML normally unescaped? */
Expand Down Expand Up @@ -333,7 +335,7 @@ function pr_isIE6() {
break;
}
}

var PR_innerHtmlWorks = null;
function getInnerHtml(node) {
// inner html is hopelessly broken in Safari 2.0.4 when the content is
Expand Down Expand Up @@ -497,7 +499,7 @@ function pr_isIE6() {
* function that takes source code and returns a list of decorations.
*/
function createSimpleLexer(shortcutStylePatterns,
fallthroughStylePatterns) {
fallthroughStylePatterns) {
var shortcuts = {};
(function () {
var allPatterns = shortcutStylePatterns.concat(fallthroughStylePatterns);
Expand Down Expand Up @@ -562,76 +564,6 @@ function pr_isIE6() {
};
}

var PR_C_STYLE_STRING_AND_COMMENT_LEXER = createSimpleLexer([
[PR_STRING, /^\'(?:[^\\\']|\\[\s\S])*(?:\'|$)/, null, "'"],
[PR_STRING, /^\"(?:[^\\\"]|\\[\s\S])*(?:\"|$)/, null, '"'],
[PR_STRING, /^\`(?:[^\\\`]|\\[\s\S])*(?:\`|$)/, null, '`']
], [
[PR_PLAIN, /^(?:[^\'\"\`\/\#]+)/, null, ' \r\n'],
[PR_COMMENT, /^#[^\r\n]*/, null, '#'],
[PR_COMMENT, /^\/\/[^\r\n]*/, null],
[PR_STRING, /^\/(?:[^\\\*\/]|\\[\s\S])+(?:\/|$)/,
REGEXP_PRECEDER_PATTERN],
[PR_COMMENT, /^\/\*[\s\S]*?(?:\*\/|$)/, null]
]);
/** splits the given string into comment, string, and "other" tokens.
* @param {string} sourceCode as plain text
* @return {Array.<number|string>} a decoration list.
* @private
*/
function splitStringAndCommentTokens(sourceCode) {
return PR_C_STYLE_STRING_AND_COMMENT_LEXER(sourceCode);
}

var PR_C_STYLE_LITERAL_IDENTIFIER_PUNC_RECOGNIZER = createSimpleLexer([], [
[PR_PLAIN, /^\s+/, null, ' \r\n'],
// TODO(mikesamuel): recognize non-latin letters and numerals in idents
[PR_PLAIN, /^[a-z_$@][a-z_$@0-9]*/i, null],
// A hex number
[PR_LITERAL, /^0x[a-f0-9]+[a-z]/i, null],
// An octal or decimal number, possibly in scientific notation
[PR_LITERAL, /^(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d+)(?:e[+\-]?\d+)?[a-z]*/i,
null, '123456789'],
[PR_PUNCTUATION, /^[^\s\w\.$@]+/, null]
// Fallback will handle decimal points not adjacent to a digit
]);

/** splits plain text tokens into more specific tokens, and then tries to
* recognize keywords, and types.
* @private
*/
function splitNonStringNonCommentTokens(source, decorations) {
for (var i = 0; i < decorations.length; i += 2) {
var style = decorations[i + 1];
if (style === PR_PLAIN) {
var start, end, chunk, subDecs;
start = decorations[i];
end = i + 2 < decorations.length ? decorations[i + 2] : source.length;
chunk = source.substring(start, end);
subDecs = PR_C_STYLE_LITERAL_IDENTIFIER_PUNC_RECOGNIZER(chunk, start);
for (var j = 0, m = subDecs.length; j < m; j += 2) {
var subStyle = subDecs[j + 1];
if (subStyle === PR_PLAIN) {
var subStart = subDecs[j];
var subEnd = j + 2 < m ? subDecs[j + 2] : chunk.length;
var token = source.substring(subStart, subEnd);
if (token === '.') {
subDecs[j + 1] = PR_PUNCTUATION;
} else if (token in ALL_KEYWORD_SET) {
subDecs[j + 1] = PR_KEYWORD;
} else if (/^@?[A-Z][A-Z$]*[a-z][A-Za-z$]*$/.test(token)) {
// classify types and annotations using Java's style conventions
subDecs[j + 1] = token.charAt(0) === '@' ? PR_LITERAL : PR_TYPE;
}
}
}
spliceArrayInto(subDecs, decorations, i, 2);
i += subDecs.length - 2;
}
}
return decorations;
}

var PR_MARKUP_LEXER = createSimpleLexer([], [
[PR_PLAIN, /^[^<]+/, null],
[PR_DECLARATION, /^<!\w[^>]*(?:>|$)/, null],
Expand Down Expand Up @@ -704,7 +636,7 @@ function pr_isIE6() {
return decorations;
}

/** returns a list of decorations, where even entries
/** returns a function that produces a list of decorations from source text.
*
* This code treats ", ', and ` as string delimiters, and \ as a string
* escape. It does not recognize perl's qq() style strings.
Expand All @@ -715,30 +647,130 @@ function pr_isIE6() {
*
* It recognizes C, C++, and shell style comments.
*
* @param {string} sourceCode as plain text
* @return {Array.<string|number>} a decoration list
* @param {Object} options a set of optional parameters.
* @return {function (sourceCode : string) : Array.<string|number>} a
* decorator that takes sourceCode as plain text and that returns a
* decoration list
*/
function decorateSource(sourceCode) {
// Split into strings, comments, and other.
// We do this because strings and comments are easily recognizable and can
// contain stuff that looks like other tokens, so we want to mark those
// early so we don't recurse into them.
var decorations = splitStringAndCommentTokens(sourceCode);
function sourceDecorator(options) {
var shortcutStylePatterns = [], fallthroughStylePatterns = [];
if (options.tripleQuotedStrings) {
shortcutStylePatterns.push(
[PR_STRING, /^(?:\'\'\'(?:[^\'\\]|\\[\s\S]|\'{1,2}(?=[^\']))*(?:\'\'\'|$)|\"\"\"(?:[^\"\\]|\\[\s\S]|\"{1,2}(?=[^\"]))*(?:\"\"\"|$)|\'(?:[^\\\']|\\[\s\S])*(?:\'|$)|\"(?:[^\\\"]|\\[\s\S])*(?:\"|$))/,
null, '\'"']);
} else if (options.multiLineStrings) {
shortcutStylePatterns.push(
[PR_STRING, /^(?:\'(?:[^\\\']|\\[\s\S])*(?:\'|$)|\"(?:[^\\\"]|\\[\s\S])*(?:\"|$)|\`(?:[^\\\`]|\\[\s\S])*(?:\`|$))/,
null, '\'"`']);
} else {
shortcutStylePatterns.push(
[PR_STRING,
/^(?:\'(?:[^\\\'\r\n]|\\.)*(?:\'|$)|\"(?:[^\\\"\r\n]|\\.)*(?:\"|$))/,
null, '"\'']);
}
fallthroughStylePatterns.push(
[PR_PLAIN, /^(?:[^\'\"\`\/\#]+)/, null, ' \r\n']);
if (options.hashComments) {
shortcutStylePatterns.push([PR_COMMENT, /^#[^\r\n]*/, null, '#']);
}
if (options.cStyleComments) {
fallthroughStylePatterns.push([PR_COMMENT, /^\/\/[^\r\n]*/, null]);
}
if (options.regexLiterals) {
fallthroughStylePatterns.push(
[PR_STRING,
/^\/(?:[^\\\*\/\[]|\\[\s\S]|\[(?:[^\]\\]|\\.)*(?:\]|$))+(?:\/|$)/,
REGEXP_PRECEDER_PATTERN]);
}
if (options.cStyleComments) {
fallthroughStylePatterns.push(
[PR_COMMENT, /^\/\*[\s\S]*?(?:\*\/|$)/, null]);
}

// Split non comment|string tokens on whitespace and word boundaries
decorations = splitNonStringNonCommentTokens(sourceCode, decorations);
var keywords = wordSet(options.keywords);

options = null;

/** splits the given string into comment, string, and "other" tokens.
* @param {string} sourceCode as plain text
* @return {Array.<number|string>} a decoration list.
* @private
*/
var splitStringAndCommentTokens = createSimpleLexer(
shortcutStylePatterns, fallthroughStylePatterns);

var styleLiteralIdentifierPuncRecognizer = createSimpleLexer([], [
[PR_PLAIN, /^\s+/, null, ' \r\n'],
// TODO(mikesamuel): recognize non-latin letters and numerals in idents
[PR_PLAIN, /^[a-z_$@][a-z_$@0-9]*/i, null],
// A hex number
[PR_LITERAL, /^0x[a-f0-9]+[a-z]/i, null],
// An octal or decimal number, possibly in scientific notation
[PR_LITERAL,
/^(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d+)(?:e[+\-]?\d+)?[a-z]*/i,
null, '123456789'],
[PR_PUNCTUATION, /^[^\s\w\.$@]+/, null]
// Fallback will handle decimal points not adjacent to a digit
]);

return decorations;
}
/** splits plain text tokens into more specific tokens, and then tries to
* recognize keywords, and types.
* @private
*/
function splitNonStringNonCommentTokens(source, decorations) {
for (var i = 0; i < decorations.length; i += 2) {
var style = decorations[i + 1];
if (style === PR_PLAIN) {
var start, end, chunk, subDecs;
start = decorations[i];
end = i + 2 < decorations.length ? decorations[i + 2] : source.length;
chunk = source.substring(start, end);
subDecs = styleLiteralIdentifierPuncRecognizer(chunk, start);
for (var j = 0, m = subDecs.length; j < m; j += 2) {
var subStyle = subDecs[j + 1];
if (subStyle === PR_PLAIN) {
var subStart = subDecs[j];
var subEnd = j + 2 < m ? subDecs[j + 2] : chunk.length;
var token = source.substring(subStart, subEnd);
if (token === '.') {
subDecs[j + 1] = PR_PUNCTUATION;
} else if (token in keywords) {
subDecs[j + 1] = PR_KEYWORD;
} else if (/^@?[A-Z][A-Z$]*[a-z][A-Za-z$]*$/.test(token)) {
// classify types and annotations using Java's style conventions
subDecs[j + 1] = token.charAt(0) === '@' ? PR_LITERAL : PR_TYPE;
}
}
}
spliceArrayInto(subDecs, decorations, i, 2);
i += subDecs.length - 2;
}
}
return decorations;
}

function cSourceDecorator(keywords, opt_options) {
return decorateSource; // TODO: implement me
}
return function (sourceCode) {
// Split into strings, comments, and other.
// We do this because strings and comments are easily recognizable and can
// contain stuff that looks like other tokens, so we want to mark those
// early so we don't recurse into them.
var decorations = splitStringAndCommentTokens(sourceCode);

// Split non comment|string tokens on whitespace and word boundaries
decorations = splitNonStringNonCommentTokens(sourceCode, decorations);

function shellSourceDecorator(keywords, opt_options) {
return decorateSource; // TODO: implement me
return decorations;
};
}

var decorateSource = sourceDecorator({
keywords: ALL_KEYWORDS,
hashComments: true,
cStyleComments: true,
multiLineStrings: true,
regexLiterals: true
});

/** identify regions of markup that are really source code, and recursively
* lex them.
* @private
Expand Down Expand Up @@ -958,22 +990,44 @@ function pr_isIE6() {
}
registerLangHandler(decorateSource, ['default-code']);
registerLangHandler(decorateMarkup,
['default-markup', 'html', 'htm', 'xhtml', 'xml']);
registerLangHandler(cSourceDecorator(CPP_KEYWORDS),
['c', 'cc', 'cpp', 'cs', 'cxx', 'cyc']);
registerLangHandler(cSourceDecorator(JAVA_KEYWORDS), ['java']);
registerLangHandler(shellSourceDecorator(SH_KEYWORDS), ['csh', 'sh']);
registerLangHandler(
shellSourceDecorator(PYTHON_KEYWORDS), ['cv', 'py'],
{ tripleQuotedStrings: true });
registerLangHandler(
shellSourceDecorator(PERL_KEYWORDS,
{ regexLiteral: true, multiLineStrings: true }), ['pl']);
registerLangHandler(
shellSourceDecorator(RUBY_KEYWORDS,
{ regexLiteral: true, multiLineStrings: true }), ['rb']);
registerLangHandler(
cSourceDecorator(JSCRIPT_KEYWORDS, { regexLiteral: true }), ['js']);
['default-markup', 'html', 'htm', 'xhtml', 'xml', 'xsl']);
registerLangHandler(sourceDecorator({
keywords: CPP_KEYWORDS,
hashComments: true,
cStyleComments: true
}), ['c', 'cc', 'cpp', 'cs', 'cxx', 'cyc']);
registerLangHandler(sourceDecorator({
keywords: JAVA_KEYWORDS,
cStyleComments: true
}), ['java']);
registerLangHandler(sourceDecorator({
keywords: SH_KEYWORDS,
hashComments: true,
multiLineStrings: true
}), ['bsh', 'csh', 'sh']);
registerLangHandler(sourceDecorator({
keywords: PYTHON_KEYWORDS,
hashComments: true,
multiLineStrings: true,
tripleQuotedStrings: true
}), ['cv', 'py']);
registerLangHandler(sourceDecorator({
keywords: PERL_KEYWORDS,
hashComments: true,
multiLineStrings: true,
regexLiterals: true
}), ['perl', 'pl', 'pm']);
registerLangHandler(sourceDecorator({
keywords: RUBY_KEYWORDS,
hashComments: true,
multiLineStrings: true,
regexLiterals: true
}), ['rb']);
registerLangHandler(sourceDecorator({
keywords: JSCRIPT_KEYWORDS,
cStyleComments: true,
regexLiterals: true
}), ['js']);

function prettyPrintOne(sourceCodeHtml, opt_langExtension) {
try {
Expand Down
Loading

0 comments on commit e351373

Please sign in to comment.