Tools to Create Formal Language Parsers
{rly} is a 100% R implementation of the common parsing tools lex
and
yacc
.
lex
is a “lexical analyzer generator”. It’s core functionality is to
split up an input stream into more usable elements. You can think of it
as a tools to help identify the interesting components in a text file,
such as ->
or %>%
in an R script.
yacc
is “yet another compiler-compiler” and the main task for it is to
take the tokens lex
provides and process them contextually.
Together, you can use them to:
- define what tokens a given language/input stream will accept
- define what R code should be executed as a language file/input stream (e.g. a program) is parsed.
This project is a R clone of Ply.
devtools::install_github("systemincloud/rly")
{rly} consists of two files : lex.R
and yacc.R
. There’s quite a bit
“going on” in those two files and they may be helpful examples of how
to structure {R6} classes.
library(rly)
The demo
directory contains several different examples. Here
are some more.
We can build a “lexer” to pull out URLs from text input. Sure, we could
just use stringi::stri_match_all_regex()
for this particular example,
but the intent is to work with a known, straightforward domain to see
how “lexing” works:
# Define what tokens we accept. In this case, just URLs
TOKENS <- c("URL")
# Build our "lexing" rules. This is an {R6} class.
# If you've never played with {R6} classes, head on
# over to <https://cran.rstudio.com/web/packages/R6/>
# to learn more about it and also take a look at the
# packages that depend on/use it (which may help you
# grok {R6} a bit better.)
URLexer <- R6::R6Class(
classname = "Lexer",
public = list(
# tell it abt the tokens we accept
tokens = TOKENS,
# we use the t_ prefix to identify that this is the
# "matcher" for the token and then give it the regular
# expression that goes with this token. The URL
# regex is a toy one that says to match http or https
# strings until it hits a space.
#
# The `t` parameter is the the full context of the token
# parser at the time it gets to this token.
#
# here, we're just printing a message out and continuing
# but we could do anything we (programmatically) want
t_URL = function(re = 'http[s]*://[^[:space:]]+', t) {
message("Found URL: ", t$value) # Be verbose when we find a URL
return(t) # we need to return the potentially modified token
},
# whenever a newline is encounterd we increment a line #
# counter. this is useful when providing contextual errors
t_newline = function(re='\\n+', t) {
t$lexer$lineno <- t$lexer$lineno + nchar(t$value)
return(NULL)
},
# the goal of the lexer is to give us valid input
# but we can ignore errors if we're just looking for
# certain things (like URLs)
t_error = function(t) {
t$lexer$skip(1)
return(t)
}
)
)
# Create out lexer
lexer <- rly::lex(URLexer)
# Feed it some data
lexer$input(s = "
http://google.com https://rstudio.org/
Not a URL https://rud.is/b Another non-URL
https://r-project.org/
https://one.more.url/with/some/extra/bits.html
")
# We'll put found URLs here (rly inefficient)
found_urls <- character(0)
# keep track of the # of invalid token info (also inefficient)
invalid <- list()
# Now, we'll iterate through the tokens we were given
repeat {
tok <- lexer$token() # get the next token
if (is.null(tok)) break # no more tokens, done with lexing
switch(
tok$type,
# Do this when we find a token identified as a `URL`
URL = found_urls <- append(found_urls, tok$value),
# Do this whenever we find an invalid token
error = invalid <- append(invalid, list(data.frame(
bad_thing = tok$value,
stream_pos = tok$lexpos,
line = tok$lineno,
stringsAsFactors = FALSE
)))
)
}
#> Found URL: http://google.com
#> Found URL: https://rstudio.org/
#> Found URL: https://rud.is/b
#> Found URL: https://r-project.org/
#> Found URL: https://one.more.url/with/some/extra/bits.html
invalid <- do.call(rbind.data.frame, invalid)
nrow(invalid) # number of errors
#> [1] 32
head(invalid, 10) # it'll be clear we never told it abt whitespace
#> bad_thing stream_pos line
#> 1 19 2
#> 2 20 2
#> 3 21 2
#> 4 22 2
#> 5 23 2
#> 6 N 45 3
#> 7 o 46 3
#> 8 t 47 3
#> 9 48 3
#> 10 a 49 3
found_urls # the good stuff
#> [1] "http://google.com"
#> [2] "https://rstudio.org/"
#> [3] "https://rud.is/b"
#> [4] "https://r-project.org/"
#> [5] "https://one.more.url/with/some/extra/bits.html"
We can extend this to do things with different types of URIs (not necessarily “http”-ish URLs)
# we'll define different token types for HTTP URLs, HTTPS URLs and
# MAILTO URLs
TOKENS <- c("HTTP_URL", "HTTPS_URL", "MAILTO_URL")
URLexer <- R6::R6Class(
classname = "Lexer",
public = list(
tokens = TOKENS,
# three different token regexes
t_HTTPS_URL = function(re = 'https://[^[:space:]]+', t) {
message("Found HTTPS URL: ", t$value)
return(t)
},
t_MAILTO_URL = function(re = 'mailto:[^[:space:]]+', t) {
message("Found MAILTO URL: ", t$value)
return(t)
},
t_HTTP_URL = function(re = 'http://[^[:space:]]+', t) {
message("Found HTTP URL: ", t$value)
return(t)
},
t_error = function(t) {
t$lexer$skip(1) # if we don't do this the lexer will error out on tokens we don't match (which is usually what we want)
return(t)
}
)
)
# Create out lexer
lexer <- rly::lex(URLexer)
# Feed it some data
lexer$input(s = "
http://google.com https://rstudio.org/
Not a URL https://rud.is/b Another non-URL mailto:[email protected]?subject=Hello
https://r-project.org/
mailto:[email protected]
https://one.more.url/with/some/extra/bits.html
")
http_urls <- character(0)
https_urls <- character(0)
mailto_urls <- character(0)
repeat {
tok <- lexer$token() # get the next token
if (is.null(tok)) break # no more tokens, done with lexing
switch(
tok$type,
HTTP_URL = http_urls <- append(http_urls, tok$value),
HTTPS_URL = https_urls <- append(https_urls, tok$value),
MAILTO_URL = mailto_urls <- append(mailto_urls, tok$value)
)
}
#> Found HTTP URL: http://google.com
#> Found HTTPS URL: https://rstudio.org/
#> Found HTTPS URL: https://rud.is/b
#> Found MAILTO URL: mailto:[email protected]?subject=Hello
#> Found HTTPS URL: https://r-project.org/
#> Found MAILTO URL: mailto:[email protected]
#> Found HTTPS URL: https://one.more.url/with/some/extra/bits.html
http_urls
#> [1] "http://google.com"
https_urls
#> [1] "https://rstudio.org/"
#> [2] "https://rud.is/b"
#> [3] "https://r-project.org/"
#> [4] "https://one.more.url/with/some/extra/bits.html"
mailto_urls
#> [1] "mailto:[email protected]?subject=Hello"
#> [2] "mailto:[email protected]"
Here is an example showing a {rly} implementation of a calculator with variables.
Let’s bootstrap our tokenizer/lexer:
TOKENS = c('NAME', 'NUMBER')
LITERALS = c('=', '+', '-', '*', '/', '(', ')') # these are "LEXEMES" (ref: https://stackoverflow.com/questions/14954721/what-is-the-difference-between-a-token-and-a-lexeme)
Lexer <- R6::R6Class(
classname = "Lexer",
public = list(
tokens = TOKENS,
literals = LITERALS,
t_NAME = '[a-zA-Z_][a-zA-Z0-9_]*',
t_NUMBER = function(re='\\d+', t) {
t$value <- strtoi(t$value)
return(t)
},
t_ignore = " \t",
t_newline = function(re='\\n+', t) {
t$lexer$lineno <- t$lexer$lineno + nchar(t$value)
return(NULL)
},
t_error = function(t) {
cat(sprintf("Illegal character '%s'", t$value[1]))
t$lexer$skip(1)
return(t)
}
)
)
Now, we write our expression parser. Note that we use TOKENS
and
LITERALS
from the lexer we just wrote so the parser has some context
for what gets passed to it as the tokeneizer emits bits to it:
Parser <- R6::R6Class(
classname = "Parser",
public = list(
tokens = TOKENS,
literals = LITERALS,
# Parsing rules
precedence = list(
c('left', '+', '-'),
c('left', '*', '/'),
c('right', 'UMINUS')
),
# dictionary of names (can be inefficient but it's cool here)
names = new.env(hash=TRUE),
# One type of "statement" is NAME=expression
p_statement_assign = function(doc='statement : NAME "=" expression', p) {
self$names[[as.character(p$get(2))]] <- p$get(4)
},
# Another type of "statement" is just an expression
p_statement_expr = function(doc='statement : expression', p) {
cat(p$get(2))
cat('\n')
},
# Classic simple definition of an expression
p_expression_binop = function(doc="expression : expression '+' expression
| expression '-' expression
| expression '*' expression
| expression '/' expression", p) {
if(p$get(3) == '+') p$set(1, p$get(2) + p$get(4))
else if(p$get(3) == '-') p$set(1, p$get(2) - p$get(4))
else if(p$get(3) == '*') p$set(1, p$get(2) * p$get(4))
else if(p$get(3) == '/') p$set(1, p$get(2) / p$get(4))
},
# unary minus is a special case we need to handle
# see https://www.ibm.com/support/knowledgecenter/en/SSLTBW_2.3.0/com.ibm.zos.v2r3.bpxa600/bpxa698.htm
# for %prec explanation
# note order does kinda matter in both lexer and parser rule specs
p_expression_uminus = function(doc="expression : '-' expression %prec UMINUS", p) {
p$set(1, -p$get(3))
},
# parnens expression
p_expression_group = function(doc="expression : '(' expression ')'", p) {
p$set(1, p$get(3))
},
p_expression_number = function(doc='expression : NUMBER', p) {
p$set(1, p$get(2))
},
p_expression_name = function(doc='expression : NAME', p) {
p$set(1, self$names[[as.character(p$get(2))]])
},
p_error = function(p) {
if(is.null(p)) cat("Syntax error at EOF")
else cat(sprintf("Syntax error at '%s'", p$value))
}
)
)
lexer <- lex(Lexer)
parser <- yacc(Parser)
# these will each end with `NULL` as that's how the `parser` signals it's done
parser$parse("3", lexer)
#> 3
#> NULL
parser$parse("3 + 5", lexer)
#> 8
#> NULL
parser$parse("3 + 5 * 10 - 100", lexer)
#> -47
#> NULL
parser$parse("A + B * C - D", lexer) # valid lexical syntax but no data to work on; in a real calculator this wld error out
#> NULL
parser$parse("A + B * C - D = E", lexer) # invalid lexical syntax
#> Syntax error at '='
#> NULL
parser$parse("A = 1 + 2", lexer) # valid syntax, still no output b/c we just did assignment
#> NULL
parser$parse("A", lexer)
#> 3
#> NULL
invisible(parser$parse("B = 5", lexer)) # using invisible() only to suppress useless NULLs
invisible(parser$parse("C = 10", lexer))
invisible(parser$parse("D = 100", lexer))
parser$parse("A + B * C - D", lexer)
#> -47
#> NULL