Skip to content

Commit

Permalink
ACM-12279 | feat: Add a sql parser feature
Browse files Browse the repository at this point in the history
Adds the sql_parser, state_machine, string_parser and string_scanner utilities.
They are all needed for the SQLParser.
  • Loading branch information
ziccardi committed Jun 19, 2024
1 parent 2ad1e6f commit 21774a5
Show file tree
Hide file tree
Showing 24 changed files with 2,203 additions and 0 deletions.
88 changes: 88 additions & 0 deletions pkg/utils/parser/sql_parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
## The SQL Parser

The SQL parser parses and validates a SQL string.
**WARNING** This version of the code does not pretend to be a complete SQL parser. It is currently intended to parse only WHERE clauses.

It parses the string by feeding a SQL grammar and a SQLScanner to the `StringParser` object.

Additionally, it will return two values that you can use to pass the SQL string to your database.

Those values are:
* Query string: this is the same as the received query, but all the values are replaced with `?`, so that you can feed the prepared statement to the DB
* Values []interface{}: this contains all the values to be passed to the DB, in the right order, to replace the `?` placeholders

For example, parsing the following SQL string

```sql
COMPANY_NAME='Red Hat' and COUNTRY='Ireland'
```
you will get:
```sql
Query: "COMPANY_NAME = ? and COUNTRY = ?"
Values: "Red Hat", "Ireland"
```
### Instantiating the parser
The parser uses the `functional options` pattern. Instantiating it with all the defaults is as easy as calling one function:
```go
parser := NewSQLParser()
```
The `NewSQLParser` function takes a variadic list of `SQLParserOption` that can be passed to configure the parser instance.
#### Supported options
##### WithValidColumns( validColumns ...string)
This can be used to limit the columns the user is allowed to reference in the SQL string.
For example, this will lead to a validation error
```go
parser := NewSQLParser(WithValidColumns("surname"))
_, _, err := parser.Parse("name = 'mickey' and surname = 'mouse'")
fmt.Println(err)
---- output
[1] error parsing the filter: invalid column name: 'name', valid values are: [surname]
```
The number in square brackets represents the position in the string where the error occurred.
##### WithMaximumComplexity( maximumComplexity int )
This can be used to specify the maximum number of logical operators allowed in the query.
```go
parser := NewSQLParser(
WithMaximumComplexity(2),
)
_, _, err := parser.Parse("(name = 'mickey' or name = 'minnie') and surname = 'mouse' and age > 20")
fmt.Println(err)
---- output
[60] error parsing the filter: maximum number of permitted joins (2) exceeded
```
##### WithColumnPrefix(columnPrefix string)
This option specifies the prefix to be added to each column in the produced output query.
For example, if we want every column to be prefixed with 'main.', we will use the following code
```go
parser := NewSQLParser(WithColumnPrefix("main"))
qry, _, _ := parser.Parse("(name = 'mickey' or name = 'minnie') and surname = 'mouse' and age >= 20")
fmt.Println(qry)
---- output
(main.name = ? or main.name = ?) and main.surname = ? and main.age >= ?
```
##### All the options together
```go
parser := NewSQLParser(
WithValidColumns("surname"),
WithColumnPrefix("main"),
WithMaximumComplexity(2),
)
qry, _, err := parser.Parse("(name = 'mickey' or name = 'minnie') and surname = 'mouse' and age >= 20")
fmt.Println("err: ", err)
fmt.Println("qry: ", qry)
---- output
err: [2] error parsing the filter: invalid column name: 'name', valid values are: [surname]
qry:
```
111 changes: 111 additions & 0 deletions pkg/utils/parser/sql_parser/sql_grammar.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package sql_parser

import (
. "github.com/openshift-online/ocm-common/pkg/utils/parser/state_machine"
. "github.com/openshift-online/ocm-common/pkg/utils/parser/string_parser"
)

// Token families and token names used by the SQL grammar.
//
// A token *family* is the value stored as StateData on each token definition;
// transitionInterceptor switches on it to decide how a matched token
// contributes to the output query. A token *name* identifies a single state
// of the grammar and is what the transition table refers to.
const (
// Token families (stored as StateData on the token definitions).
braceTokenFamily = "BRACE"
opTokenFamily = "OP"
logicalOpTokenFamily = "LOGICAL"
columnTokenFamily = "COLUMN"

// othersTokenFamily is not referenced by the grammar defined in this file.
othersTokenFamily = "OTHERS"
valueTokenFamily = "VALUE"
quotedValueTokenFamily = "QUOTED"
// Token names (one per grammar state).
openBrace = "OPEN_BRACE"
closedBrace = "CLOSED_BRACE"
comma = "COMMA"
column = "COLUMN"
value = "VALUE"
quotedValue = "QUOTED_VALUE"
// Comparison operators.
eq = "EQ"
notEq = "NOT_EQ"
gt = "GREATER_THAN"
lt = "LESS_THAN"
gte = "GREATER_THAN_OR_EQUAL"
lte = "LESS_THAN_OR_EQUAL"
like = "LIKE"
ilike = "ILIKE"
in = "IN"
// Tokens that are only valid inside the parenthesised list following IN.
listOpenBrace = "LIST_OPEN_BRACE"
quotedValueInList = "QUOTED_VALUE_IN_LIST"
valueInList = "VALUE_IN_LIST"
// Logical operators.
and = "AND"
or = "OR"
not = "NOT"

// Names of the JSONB-related tokens and the family they belong to.

jsonbFamily = "JSONB" // Each JSONB token will be associated to the JSONB family
jsonbField = "JSON_FIELD" // Each JSONB field
jsonbArrow = "JSONB_ARROW" // The JSONB arrow token (->)
jsonbToString = "JSONB_TOSTRING" // The JSONB to-string token (->>)
jsonbContains = "@>" // The JSONB @> token
jsonbFieldToStringify = "JSONB_FIELD_TO_STRINGIFY" // The field that will contain the `string` value, ie: ->> FIELD
)

// BasicSQLGrammar returns the Grammar describing the subset of SQL accepted by
// the parser (WHERE-clause style filters).
//
// The grammar has two parts:
//   - Tokens: one definition per grammar state, with the acceptor that
//     recognises the token text and, as StateData, the token family that
//     transitionInterceptor switches on.
//   - Transitions: for each token, the list of tokens allowed to follow it.
//
// Several tokens intentionally share the same acceptor (for example openBrace
// and listOpenBrace, or quotedValue, quotedValueInList, jsonbField and
// jsonbFieldToStringify): they are distinct states so that each can have its
// own set of valid follow-up tokens.
func BasicSQLGrammar() Grammar {
grammar := Grammar{
Tokens: []TokenDefinition{
// Braces, columns and plain/quoted values.
{Name: openBrace, StateData: braceTokenFamily, Acceptor: StringAcceptor(`(`)},
{Name: closedBrace, StateData: braceTokenFamily, Acceptor: StringAcceptor(`)`)},
{Name: column, StateData: columnTokenFamily, Acceptor: RegexpAcceptor(`(?i)[A-Z][A-Z0-9_.]*`)},
{Name: value, StateData: valueTokenFamily, Acceptor: RegexpAcceptor(`[^'() ]*`)},
{Name: quotedValue, StateData: quotedValueTokenFamily, Acceptor: RegexpAcceptor(`'([^']|\\')*'`)},
// Comparison operators.
{Name: eq, StateData: opTokenFamily, Acceptor: StringAcceptor(`=`)},
{Name: gt, StateData: opTokenFamily, Acceptor: StringAcceptor(`>`)},
{Name: lt, StateData: opTokenFamily, Acceptor: StringAcceptor(`<`)},
{Name: gte, StateData: opTokenFamily, Acceptor: StringAcceptor(`>=`)},
{Name: lte, StateData: opTokenFamily, Acceptor: StringAcceptor(`<=`)},
// comma carries no StateData, so it is handled by the interceptor's default branch.
{Name: comma, Acceptor: StringAcceptor(`,`)},
{Name: notEq, StateData: opTokenFamily, Acceptor: StringAcceptor(`<>`)},
{Name: like, StateData: opTokenFamily, Acceptor: RegexpAcceptor(`(?i)LIKE`)},
{Name: ilike, StateData: opTokenFamily, Acceptor: RegexpAcceptor(`(?i)ILIKE`)},
{Name: in, StateData: opTokenFamily, Acceptor: RegexpAcceptor(`(?i)IN`)},
// Tokens valid only inside the list that follows IN.
{Name: listOpenBrace, StateData: braceTokenFamily, Acceptor: StringAcceptor(`(`)},
{Name: quotedValueInList, StateData: quotedValueTokenFamily, Acceptor: RegexpAcceptor(`'([^']|\\')*'`)},
{Name: valueInList, StateData: valueTokenFamily, Acceptor: RegexpAcceptor(`[^'() ]*`)},
// Logical operators. NOT is in the logical family too, so the interceptor
// counts it towards the complexity limit just like AND and OR.
{Name: and, StateData: logicalOpTokenFamily, Acceptor: RegexpAcceptor(`(?i)AND`)},
{Name: or, StateData: logicalOpTokenFamily, Acceptor: RegexpAcceptor(`(?i)OR`)},
{Name: not, StateData: logicalOpTokenFamily, Acceptor: RegexpAcceptor(`(?i)NOT`)},
// JSONB navigation and containment tokens.
{Name: jsonbArrow, StateData: jsonbFamily, Acceptor: StringAcceptor(`->`)},
{Name: jsonbField, StateData: jsonbFamily, Acceptor: RegexpAcceptor(`'([^']|\\')*'`)},
{Name: jsonbToString, StateData: jsonbFamily, Acceptor: StringAcceptor(`->>`)},
{Name: jsonbContains, StateData: jsonbFamily, Acceptor: StringAcceptor(`@>`)},
{Name: jsonbFieldToStringify, StateData: jsonbFamily, Acceptor: RegexpAcceptor(`'([^']|\\')*'`)},
},
Transitions: []TokenTransitions{
// A filter starts with a column or an opening brace.
{TokenName: StartState, ValidTransitions: []string{column, openBrace}},
{TokenName: openBrace, ValidTransitions: []string{column, openBrace}},
// A column is followed by an operator, by NOT (for NOT IN) or by a JSONB arrow.
{TokenName: column, ValidTransitions: []string{gt, lt, gte, lte, eq, notEq, like, ilike, in, not, jsonbArrow}},
// Every comparison operator is followed by a quoted or unquoted value.
{TokenName: eq, ValidTransitions: []string{quotedValue, value}},
{TokenName: notEq, ValidTransitions: []string{quotedValue, value}},
{TokenName: gt, ValidTransitions: []string{quotedValue, value}},
{TokenName: lt, ValidTransitions: []string{quotedValue, value}},
{TokenName: lte, ValidTransitions: []string{quotedValue, value}},
{TokenName: gte, ValidTransitions: []string{quotedValue, value}},
{TokenName: like, ValidTransitions: []string{quotedValue, value}},
{TokenName: ilike, ValidTransitions: []string{quotedValue, value}},
// A value or a closing brace may end the filter, close a group, or be
// chained with AND / OR.
{TokenName: quotedValue, ValidTransitions: []string{or, and, closedBrace, EndState}},
{TokenName: value, ValidTransitions: []string{or, and, closedBrace, EndState}},
{TokenName: closedBrace, ValidTransitions: []string{or, and, closedBrace, EndState}},
{TokenName: and, ValidTransitions: []string{column, openBrace}},
{TokenName: or, ValidTransitions: []string{column, openBrace}},
// NOT is only accepted as part of NOT IN.
{TokenName: not, ValidTransitions: []string{in}},
// IN ( v1, v2, ... ): the list is closed by the regular closedBrace token.
{TokenName: in, ValidTransitions: []string{listOpenBrace}},
{TokenName: listOpenBrace, ValidTransitions: []string{quotedValueInList, valueInList}},
{TokenName: quotedValueInList, ValidTransitions: []string{comma, closedBrace}},
{TokenName: valueInList, ValidTransitions: []string{comma, closedBrace}},
{TokenName: comma, ValidTransitions: []string{quotedValueInList, valueInList}},
// JSONB: column -> 'field' [-> 'field' ...] then either ->> 'field' <op> ... or @> 'value'.
{TokenName: jsonbArrow, ValidTransitions: []string{jsonbField}},
{TokenName: jsonbField, ValidTransitions: []string{jsonbArrow, jsonbToString, jsonbContains}},
{TokenName: jsonbToString, ValidTransitions: []string{jsonbFieldToStringify}},
{TokenName: jsonbFieldToStringify, ValidTransitions: []string{eq, notEq, like, ilike, in, not}},
{TokenName: jsonbContains, ValidTransitions: []string{quotedValue}},
},
}

return grammar
}
131 changes: 131 additions & 0 deletions pkg/utils/parser/sql_parser/sql_parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
package sql_parser

import (
"fmt"
"github.com/openshift-online/ocm-common/pkg/utils/parser/state_machine"
"github.com/openshift-online/ocm-common/pkg/utils/parser/string_parser"
"strings"
)

const defaultMaximumComplexity = 10

// SQLParser parses and validates SQL WHERE clauses. Only the portion that
// follows the `WHERE` keyword is supported; the keyword itself must not be
// part of the input.
type SQLParser interface {
// Parse parses the received SQL string and returns the parsed values or an error.
//
// Returns:
// - string: the parsed SQL with every value replaced by a '?' placeholder
// - interface{}: the values to pass to the database, in placeholder order;
//   the implementation in this package stores a []interface{} in it
// - error: non-nil in case of any error; the other two results are then
//   the empty string and nil
Parse(sql string) (string, interface{}, error)
}

// sqlParser is the SQLParser implementation built by NewSQLParser.
//
// It keeps the result of the parse in progress in its own fields, and Parse
// resets those fields on entry, so one instance holds the state of a single
// Parse call at a time.
type sqlParser struct {
// configuration
// maximumComplexity is the highest number of logical operators accepted.
maximumComplexity int
// parser is the underlying string parser driving the grammar.
parser *string_parser.StringParser

// current parsing state
// counts the number of joins
complexity int
// counts the number of braces to be closed
openBraces int
// validColumns and columnPrefix are configuration set through the parser
// options; unlike the counters above they are not cleared by reset.
validColumns []string
columnPrefix string

// current parsing result
// resultQry accumulates the output query with '?' placeholders.
resultQry string
// resultValues accumulates the values matching the placeholders, in order.
resultValues []interface{}
}

// Compile-time check that *sqlParser implements SQLParser.
var _ SQLParser = &sqlParser{}

// Parse validates the received filter and converts it into a query with '?'
// placeholders plus the list of values to bind to them.
//
// The parser state left by any previous call is discarded first. On failure
// the returned query is empty and the values are nil.
func (p *sqlParser) Parse(sql string) (string, interface{}, error) {
	p.reset()

	err := p.parser.Parse(sql)
	if err != nil {
		return "", nil, err
	}

	// Every '(' seen by the interceptor must have been matched by a ')'.
	if unbalanced := p.openBraces > 0; unbalanced {
		return "", nil, fmt.Errorf("EOF while searching for closing brace ')'")
	}

	// The interceptor pads tokens with spaces; drop the ones at the edges.
	trimmed := strings.Trim(p.resultQry, " ")
	p.resultQry = trimmed
	return trimmed, p.resultValues, nil
}

// reset clears the counters and the accumulated result so that a new Parse
// call starts from scratch. Configuration fields are left untouched.
func (p *sqlParser) reset() {
	p.complexity, p.openBraces = 0, 0
	p.resultQry, p.resultValues = "", nil
}

// transitionInterceptor is the callback registered on the string parser. It
// receives the state being entered and the text of the token that caused the
// transition, validates the token and appends its contribution to the output
// query and to the list of values.
//
// The behaviour depends on the token family stored as custom data on the
// destination state:
//   - brace:   tracks brace balance and copies the brace to the query
//   - value:   emits a '?' placeholder and records the raw token as value
//   - quoted:  emits a '?' placeholder and records the unescaped, unquoted text
//   - logical: enforces the maximum complexity and copies the operator
//   - column:  lowercases, validates and optionally prefixes the column name
//   - others:  copied to the query preceded by a space
//
// It returns an error when a ')' has no matching '(', when the number of
// logical operators exceeds the configured maximum, or when a column is not
// in the list of valid columns.
func (p *sqlParser) transitionInterceptor(_, to *state_machine.State[string, string], tokenValue string) error {
	// countOpenBraces keeps track of how many braces are still to be closed
	// and fails as soon as a closing brace has no matching opening one.
	countOpenBraces := func(tok string) error {
		switch tok {
		case "(":
			p.openBraces++
		case ")":
			p.openBraces--
		}
		if p.openBraces < 0 {
			return fmt.Errorf("unexpected ')'")
		}
		return nil
	}

	tokenFamily := to.Data() // The grammar configures the custom state data as the token family
	switch tokenFamily {
	case braceTokenFamily:
		if err := countOpenBraces(tokenValue); err != nil {
			return err
		}
		p.resultQry += tokenValue
		return nil
	case valueTokenFamily:
		p.resultQry += " ?"
		p.resultValues = append(p.resultValues, tokenValue)
		return nil
	case quotedValueTokenFamily:
		p.resultQry += " ?"
		// unescape
		unquoted := strings.ReplaceAll(tokenValue, `\'`, "'")
		// Remove the surrounding quotes. Both quotes are single-byte ASCII
		// characters, so the string is sliced by byte offsets: mixing a rune
		// slice with the byte length would produce wrong bounds whenever the
		// value contains multi-byte UTF-8 characters.
		if len(unquoted) > 1 {
			unquoted = unquoted[1 : len(unquoted)-1]
		}
		p.resultValues = append(p.resultValues, unquoted)
		return nil
	case logicalOpTokenFamily:
		p.complexity++
		if p.complexity > p.maximumComplexity {
			return fmt.Errorf("maximum number of permitted joins (%d) exceeded", p.maximumComplexity)
		}
		p.resultQry += " " + tokenValue + " "
		return nil
	case columnTokenFamily:
		// we want column names to be lowercase
		columnName := strings.ToLower(tokenValue)
		// An empty list of valid columns means that every column is accepted.
		if len(p.validColumns) > 0 && !contains(p.validColumns, columnName) {
			return fmt.Errorf("invalid column name: '%s', valid values are: %v", tokenValue, p.validColumns)
		}
		// Do not add the prefix twice when the column already carries it.
		if p.columnPrefix != "" && !strings.HasPrefix(columnName, p.columnPrefix+".") {
			columnName = p.columnPrefix + "." + columnName
		}
		p.resultQry += columnName
		return nil
	default:
		p.resultQry += " " + tokenValue
		return nil
	}
}

// contains reports whether value is one of the elements of ary. The
// comparison is exact (case-sensitive); a nil or empty slice never matches.
func contains(ary []string, value string) bool {
	for i := 0; i < len(ary); i++ {
		if ary[i] == value {
			return true
		}
	}
	return false
}
45 changes: 45 additions & 0 deletions pkg/utils/parser/sql_parser/sql_parser_builder.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package sql_parser

import (
"github.com/openshift-online/ocm-common/pkg/utils/parser/string_parser"
"strings"
)

// SQLParserOption is a functional option that configures the parser instance
// created by NewSQLParser.
type SQLParserOption func(parser *sqlParser)

// WithValidColumns restricts the columns that may appear in the parsed SQL.
// Column names found in the SQL are lowercased before being looked up, so the
// names passed here should be lowercase. Passing no names leaves every column
// allowed.
func WithValidColumns(validColumns ...string) SQLParserOption {
	apply := func(p *sqlParser) {
		p.validColumns = validColumns
	}
	return apply
}

// WithColumnPrefix sets the prefix added, followed by a dot, in front of each
// column of the produced query. Leading and trailing spaces are removed from
// the prefix; an empty prefix disables the feature.
func WithColumnPrefix(columnPrefix string) SQLParserOption {
	prefix := strings.Trim(columnPrefix, " ")
	return func(p *sqlParser) {
		p.columnPrefix = prefix
	}
}

// WithMaximumComplexity sets the maximum number of logical operators the
// parser accepts in a single query, overriding the default.
func WithMaximumComplexity(maximumComplexity int) SQLParserOption {
	apply := func(p *sqlParser) {
		p.maximumComplexity = maximumComplexity
	}
	return apply
}

// NewSQLParser creates a SQLParser. Without options the parser accepts any
// column, adds no column prefix and uses the default maximum complexity; the
// received options are applied in order on top of those defaults.
func NewSQLParser(options ...SQLParserOption) SQLParser {
	p := &sqlParser{maximumComplexity: defaultMaximumComplexity}
	for i := range options {
		options[i](p)
	}

	// The string parser calls back into p for every transition, which is how
	// the output query and values get assembled.
	p.parser = string_parser.NewStringParserBuilder().
		WithGrammar(BasicSQLGrammar()).
		WithTransitionInterceptor(p.transitionInterceptor).
		WithScanner(NewSQLScanner()).
		Build()

	return p
}
Loading

0 comments on commit 21774a5

Please sign in to comment.