From 7ac6c688e10930b42063e5b81d91c531dff03df4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Fri, 5 Jan 2024 15:58:51 +0100 Subject: [PATCH 01/20] parser init --- src/main.rs | 11 +++++-- src/parser/mod.rs | 25 ++++++++++++++++ src/parser/syntaxkind.rs | 1 + src/syntax/mod.rs | 1 + src/syntax/syntax_node.rs | 62 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 src/syntax/syntax_node.rs diff --git a/src/main.rs b/src/main.rs index 9d778e8..ca71ff4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,10 @@ -use lexer::LogosToken; -use logos::Logos; /// This library is used to create a parser for YARA language /// It should provide also token for whitespaces /// as we want full fidelity and error resilience.; use std::{env::args, fs, path::Path}; use crate::lexer::tokenize; +use crate::syntax::syntax_error::SyntaxError; mod lexer; mod parser; @@ -17,6 +16,12 @@ fn main() { let path = Path::new(&arg); let input = fs::read_to_string(path).unwrap(); - let tokens = tokenize(&input); + parse_text(&input); +} + +fn parse_text(text: &str) -> ((), Vec) { + let tokens = tokenize(&text); println!("{:?}", tokens); + + ((), Vec::new()) } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index d097c94..4210c51 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1 +1,26 @@ pub mod syntaxkind; + +#[allow(unreachable_pub)] +pub use self::syntaxkind::SyntaxKind; + +impl From for SyntaxKind { + #[inline] + fn from(d: u16) -> SyntaxKind { + assert!(d <= (SyntaxKind::__LAST as u16)); + unsafe { std::mem::transmute::(d) } + } +} + +impl From for u16 { + #[inline] + fn from(k: SyntaxKind) -> u16 { + k as u16 + } +} + +impl SyntaxKind { + #[inline] + pub fn is_trivia(self) -> bool { + matches!(self, SyntaxKind::WHITESPACE | SyntaxKind::COMMENT) + } +} \ No newline at end of file diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index dfd14d3..79510f3 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -26,4 +26,5 @@ pub enum SyntaxKind { COMMENT, MULTILINECOMMENT, ERROR, + __LAST, } diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 98e8d1c..6438a92 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -1 +1,2 @@ pub mod syntax_error; +pub mod syntax_node; diff --git a/src/syntax/syntax_node.rs b/src/syntax/syntax_node.rs new file mode 100644 index 0000000..bdb932b --- /dev/null +++ b/src/syntax/syntax_node.rs @@ -0,0 +1,62 @@ +//! This module defines Concrete Syntax Tree (CST), used by rust-analyzer. +//! +//! The CST includes comments and whitespace, provides a single node type, +//! `SyntaxNode`, and a basic traversal API (parent, children, siblings). +//! +//! The *real* implementation is in the (language-agnostic) `rowan` crate, this +//! module just wraps its API. + +use rowan_test::{GreenNodeBuilder, Language}; + +use crate::parser::syntaxkind::SyntaxKind; +use crate::SyntaxError; + +pub(crate) use rowan_test::GreenNode; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum YARALanguage {} +impl Language for YARALanguage { + type Kind = SyntaxKind; + + fn kind_from_raw(raw: rowan_test::SyntaxKind) -> SyntaxKind { + SyntaxKind::from(raw.0) + } + + fn kind_to_raw(kind: SyntaxKind) -> rowan_test::SyntaxKind { + rowan_test::SyntaxKind(kind.into()) + } +} + +pub type SyntaxNode = rowan_test::SyntaxNode; +pub type SyntaxToken = rowan_test::SyntaxToken; +pub type SyntaxElement = rowan_test::SyntaxElement; +pub type SyntaxNodeChildren = rowan_test::SyntaxNodeChildren; +pub type SyntaxElementChildren = rowan_test::SyntaxElementChildren; +pub type PreorderWithTokens = rowan_test::api::PreorderWithTokens; + +#[derive(Default)] +pub struct SyntaxTreeBuilder { + errors: Vec, + inner: GreenNodeBuilder<'static>, +} + +impl SyntaxTreeBuilder { + pub(crate) fn finish_raw(self) -> (GreenNode, Vec) { + let green = self.inner.finish(); + (green, self.errors) + } + + pub fn token(&mut self, kind: SyntaxKind, text: &str) { + let kind = YARALanguage::kind_to_raw(kind); + self.inner.token(kind, text) + } + + pub fn start_node(&mut self, kind: SyntaxKind) { + let kind = YARALanguage::kind_to_raw(kind); + self.inner.start_node(kind) + } + + pub fn finish_node(&mut self) { + self.inner.finish_node() + } +} From b796ecf40fe44442eea6c40f62223fb22cb9b8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Fri, 5 Jan 2024 16:01:50 +0100 Subject: [PATCH 02/20] Create rust.yml --- .github/workflows/rust.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/rust.yml diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..31000a2 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,22 @@ +name: Rust + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose From 30171344acb29d343629c2102a602f8ec0c54140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Fri, 5 Jan 2024 16:02:33 +0100 Subject: [PATCH 03/20] Create code-health.yml --- .github/workflows/code-health.yml | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/code-health.yml diff --git a/.github/workflows/code-health.yml b/.github/workflows/code-health.yml new file mode 100644 index 0000000..fa2944d --- /dev/null +++ b/.github/workflows/code-health.yml @@ -0,0 +1,40 @@ +name: Code health + +on: [push, pull_request] + +jobs: + clippy: + name: Clippy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@1.71.0 + with: + components: clippy + - run: cargo clippy --tests --no-deps --all-features -- --deny clippy::all + + rustfmt: + name: Rustfmt + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@1.71.0 + with: + components: rustfmt + - run: cargo fmt --all --check + + udeps: + name: Unused dependencies + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Install nightly toolchain + uses: dtolnay/rust-toolchain@nightly + + - name: Run cargo-udeps + uses: aig787/cargo-udeps-action@v1 + with: + version: v0.1.35 + args: '--all-targets' From 9ce26c057b68599c2dce429968a7483b6d62d6fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Fri, 5 Jan 2024 16:15:23 +0100 Subject: [PATCH 04/20] Change few clippy warnings --- src/lexer/mod.rs | 13 +++++-------- src/main.rs | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 618fdb7..cee042f 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -122,23 +122,20 @@ pub fn tokenize(text: &str) -> (Vec, Vec) { let mut errors = Vec::new(); let mut offset = 0; - let logos_tokens: Vec<_> = LogosToken::lexer(&text).spanned().collect(); + let logos_tokens: Vec<_> = LogosToken::lexer(text).spanned().collect(); // Loop over all tokens, convert them to syntaxkind and push them into tokens vector // also push errors into errors vector for (token, range) in logos_tokens { let token_len = range.len().try_into().unwrap(); let token_range = TextRange::at(offset.try_into().unwrap(), token_len); - let syntaxkind; - match token { - Ok(token) => { - syntaxkind = logos_tokenkind_to_syntaxkind(token); - } + let syntaxkind = match token { + Ok(token) => logos_tokenkind_to_syntaxkind(token), Err(err) => { errors.push(SyntaxError::new(err.to_string(), token_range)); - syntaxkind = SyntaxKind::ERROR; + SyntaxKind::ERROR } - } + }; tokens.push(Token { kind: syntaxkind, len: token_len, diff --git a/src/main.rs b/src/main.rs index ca71ff4..1a150cb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,7 +20,7 @@ fn main() { } fn parse_text(text: &str) -> ((), Vec) { - let tokens = tokenize(&text); + let tokens = tokenize(text); println!("{:?}", tokens); ((), Vec::new()) From 9af834ff91fd9182fb86f4ae97ae5090e1f051da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Mon, 8 Jan 2024 11:53:18 +0100 Subject: [PATCH 05/20] Implement TreeSink and TokenSource --- src/lexer/mod.rs | 3 +- src/main.rs | 19 +++- src/parser/mod.rs | 50 ++++++---- src/parser/parser.rs | 0 src/parser/syntaxkind.rs | 23 ++++- src/syntax/ast/mod.rs | 72 ++++++++++++++ src/syntax/mod.rs | 3 + src/syntax/syntax_node.rs | 8 +- src/syntax/text_token_source.rs | 76 +++++++++++++++ src/syntax/text_tree_sink.rs | 167 ++++++++++++++++++++++++++++++++ 10 files changed, 393 insertions(+), 28 deletions(-) create mode 100644 src/parser/parser.rs create mode 100644 src/syntax/ast/mod.rs create mode 100644 src/syntax/text_token_source.rs create mode 100644 src/syntax/text_tree_sink.rs diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index cee042f..bb30898 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -175,8 +175,7 @@ fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind { LogosToken::True => SyntaxKind::TRUE, LogosToken::False => SyntaxKind::FALSE, LogosToken::Whitespace => SyntaxKind::WHITESPACE, - LogosToken::Comment => SyntaxKind::COMMENT, - LogosToken::MultilineComment => SyntaxKind::MULTILINECOMMENT, + LogosToken::Comment | LogosToken::MultilineComment => SyntaxKind::COMMENT, } } diff --git a/src/main.rs b/src/main.rs index 1a150cb..5201322 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,8 +3,13 @@ /// as we want full fidelity and error resilience.; use std::{env::args, fs, path::Path}; +use rowan_test::GreenNode; + use crate::lexer::tokenize; -use crate::syntax::syntax_error::SyntaxError; +use crate::parser::{TokenSource, TreeSink}; +use crate::syntax::{ + syntax_error::SyntaxError, text_token_source::TextTokenSource, text_tree_sink::TextTreeSink, +}; mod lexer; mod parser; @@ -19,9 +24,15 @@ fn main() { parse_text(&input); } -fn parse_text(text: &str) -> ((), Vec) { - let tokens = tokenize(text); +fn parse_text(text: &str) -> (GreenNode, Vec) { + let (tokens, lexer_errors) = tokenize(text); + let mut token_source = TextTokenSource::new(text, &tokens); + let mut tree_sink = TextTreeSink::new(text, &tokens); println!("{:?}", tokens); - ((), Vec::new()) + parser::parse(&mut token_source, &mut tree_sink); + let (tree, mut parser_errors) = tree_sink.finish(); + parser_errors.extend(lexer_errors); + + (tree, parser_errors) } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 4210c51..4c3c033 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,26 +1,36 @@ pub mod syntaxkind; -#[allow(unreachable_pub)] -pub use self::syntaxkind::SyntaxKind; - -impl From for SyntaxKind { - #[inline] - fn from(d: u16) -> SyntaxKind { - assert!(d <= (SyntaxKind::__LAST as u16)); - unsafe { std::mem::transmute::(d) } - } +pub use syntaxkind::SyntaxKind; +mod parser; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ParseError(pub Box); + +pub trait TokenSource { + fn current(&self) -> Token; + + fn lookahead_nth(&self, n: usize) -> Token; + + fn bump(&mut self); } -impl From for u16 { - #[inline] - fn from(k: SyntaxKind) -> u16 { - k as u16 - } +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub struct Token { + pub kind: SyntaxKind, + + pub is_jointed_to_next: bool, } -impl SyntaxKind { - #[inline] - pub fn is_trivia(self) -> bool { - matches!(self, SyntaxKind::WHITESPACE | SyntaxKind::COMMENT) - } -} \ No newline at end of file +pub trait TreeSink { + fn token(&mut self, kind: SyntaxKind, n_tokens: u8); + + fn start_node(&mut self, kind: SyntaxKind); + + fn finish_node(&mut self); + + fn error(&mut self, error: ParseError); +} + +pub fn parse(token_source: &mut dyn TokenSource, tree_sink: &mut dyn TreeSink) { + //let mut p = parser::Parser::new(token_source); +} diff --git a/src/parser/parser.rs b/src/parser/parser.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index 79510f3..12b51cf 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -24,7 +24,28 @@ pub enum SyntaxKind { FALSE, WHITESPACE, COMMENT, - MULTILINECOMMENT, ERROR, __LAST, } + +impl From for SyntaxKind { + #[inline] + fn from(d: u16) -> SyntaxKind { + assert!(d <= (SyntaxKind::__LAST as u16)); + unsafe { std::mem::transmute::(d) } + } +} + +impl From for u16 { + #[inline] + fn from(k: SyntaxKind) -> u16 { + k as u16 + } +} + +impl SyntaxKind { + #[inline] + pub fn is_trivia(self) -> bool { + matches!(self, SyntaxKind::WHITESPACE | SyntaxKind::COMMENT) + } +} diff --git a/src/syntax/ast/mod.rs b/src/syntax/ast/mod.rs new file mode 100644 index 0000000..2dd05f8 --- /dev/null +++ b/src/syntax/ast/mod.rs @@ -0,0 +1,72 @@ +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct CommentKind { + pub shape: CommentShape, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum CommentShape { + Line, + Block, +} + +impl CommentShape { + pub fn is_line(self) -> bool { + self == CommentShape::Line + } + + pub fn is_block(self) -> bool { + self == CommentShape::Block + } +} + +impl CommentKind { + const BY_PREFIX: [(&'static str, CommentKind); 5] = [ + ( + "/**/", + CommentKind { + shape: CommentShape::Block, + }, + ), + ( + "/***", + CommentKind { + shape: CommentShape::Block, + }, + ), + ( + "////", + CommentKind { + shape: CommentShape::Line, + }, + ), + ( + "//", + CommentKind { + shape: CommentShape::Line, + }, + ), + ( + "/*", + CommentKind { + shape: CommentShape::Block, + }, + ), + ]; + + pub(crate) fn from_text(text: &str) -> CommentKind { + let &(_prefix, kind) = CommentKind::BY_PREFIX + .iter() + .find(|&(prefix, _kind)| text.starts_with(prefix)) + .unwrap(); + kind + } + + pub fn prefix(&self) -> &'static str { + let &(prefix, _) = CommentKind::BY_PREFIX + .iter() + .rev() + .find(|(_, kind)| kind == self) + .unwrap(); + prefix + } +} diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 6438a92..e1b517f 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -1,2 +1,5 @@ pub mod syntax_error; pub mod syntax_node; +pub mod text_token_source; +pub mod text_tree_sink; +pub mod ast; diff --git a/src/syntax/syntax_node.rs b/src/syntax/syntax_node.rs index bdb932b..a84a52d 100644 --- a/src/syntax/syntax_node.rs +++ b/src/syntax/syntax_node.rs @@ -7,8 +7,9 @@ //! module just wraps its API. use rowan_test::{GreenNodeBuilder, Language}; +use text_size::TextSize; -use crate::parser::syntaxkind::SyntaxKind; +use crate::parser::{self, syntaxkind::SyntaxKind}; use crate::SyntaxError; pub(crate) use rowan_test::GreenNode; @@ -59,4 +60,9 @@ impl SyntaxTreeBuilder { pub fn finish_node(&mut self) { self.inner.finish_node() } + + pub fn error(&mut self, error: parser::ParseError, text_pos: TextSize) { + self.errors + .push(SyntaxError::new_at_offset(*error.0, text_pos)) + } } diff --git a/src/syntax/text_token_source.rs b/src/syntax/text_token_source.rs new file mode 100644 index 0000000..279d7f1 --- /dev/null +++ b/src/syntax/text_token_source.rs @@ -0,0 +1,76 @@ +use crate::{ + lexer::Token, + parser::{self, SyntaxKind::EOF, TokenSource}, +}; +use text_size::{TextRange, TextSize}; + +pub(crate) struct TextTokenSource<'t> { + text: &'t str, + + token_offset_pairs: Vec<(Token, TextSize)>, + + curr: (parser::Token, usize), +} + +impl<'t> TokenSource for TextTokenSource<'t> { + fn current(&self) -> parser::Token { + self.curr.0 + } + + fn lookahead_nth(&self, n: usize) -> parser::Token { + mk_token(self.curr.1 + n, &self.token_offset_pairs) + } + + fn bump(&mut self) { + if self.curr.0.kind == EOF { + return; + } + + let pos = self.curr.1 + 1; + self.curr = (mk_token(pos, &self.token_offset_pairs), pos); + } +} + +fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token { + let (kind, is_jointed_to_next) = match token_offset_pairs.get(pos) { + Some((token, offset)) => ( + token.kind, + token_offset_pairs + .get(pos + 1) + .map(|(_, next_offset)| offset + token.len == *next_offset) + .unwrap_or(false), + ), + None => (EOF, false), + }; + parser::Token { + kind, + is_jointed_to_next, + } +} + +impl<'t> TextTokenSource<'t> { + pub(crate) fn new(text: &'t str, raw_tokens: &'t [Token]) -> TextTokenSource<'t> { + let token_offset_pairs: Vec<_> = raw_tokens + .iter() + .filter_map({ + let mut len = 0.into(); + move |token| { + let pair = if token.kind.is_trivia() { + None + } else { + Some((*token, len)) + }; + len += token.len; + pair + } + }) + .collect(); + + let first = mk_token(0, &token_offset_pairs); + TextTokenSource { + text, + token_offset_pairs, + curr: (first, 0), + } + } +} diff --git a/src/syntax/text_tree_sink.rs b/src/syntax/text_tree_sink.rs new file mode 100644 index 0000000..e403b79 --- /dev/null +++ b/src/syntax/text_tree_sink.rs @@ -0,0 +1,167 @@ +use std::mem; +use text_size::{TextRange, TextSize}; + +use crate::{ + lexer::Token, + parser::{ParseError, SyntaxKind, TreeSink}, + syntax::{ + ast, syntax_error::SyntaxError, syntax_node::GreenNode, syntax_node::SyntaxTreeBuilder, + }, +}; + +pub(crate) struct TextTreeSink<'a> { + text: &'a str, + tokens: &'a [Token], + text_pos: TextSize, + token_pos: usize, + state: State, + inner: SyntaxTreeBuilder, +} + +enum State { + PendingStart, + Normal, + PendingFinish, +} + +impl<'a> TreeSink for TextTreeSink<'a> { + fn token(&mut self, kind: SyntaxKind, n_tokens: u8) { + match mem::replace(&mut self.state, State::Normal) { + State::PendingStart => unreachable!(), + State::PendingFinish => { + self.inner.finish_node(); + } + State::Normal => (), + } + self.eat_trivias(); + let n_tokens = n_tokens as usize; + let len = self.tokens[self.token_pos..self.token_pos + n_tokens] + .iter() + .map(|it| it.len) + .sum::(); + self.do_token(kind, len, n_tokens); + } + + fn start_node(&mut self, kind: SyntaxKind) { + match mem::replace(&mut self.state, State::Normal) { + State::PendingStart => { + self.inner.start_node(kind); + return; + } + State::PendingFinish => { + self.inner.finish_node(); + } + State::Normal => (), + } + + let n_trivias = self.tokens[self.token_pos..] + .iter() + .take_while(|it| it.kind.is_trivia()) + .count(); + let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias]; + let mut trivia_end = leading_trivias.iter().map(|it| it.len).sum::(); + + let n_attached_trivias = { + let leading_trivias = leading_trivias.iter().rev().map(|it| { + let next_end = trivia_end - it.len; + let range = TextRange::new(next_end, trivia_end); + trivia_end = next_end; + (it.kind, &self.text[range]) + }); + n_attached_trivias(kind, leading_trivias) + }; + self.eat_n_trivias(n_trivias - n_attached_trivias); + self.inner.start_node(kind); + self.eat_n_trivias(n_attached_trivias); + } + + fn finish_node(&mut self) { + match mem::replace(&mut self.state, State::PendingFinish) { + State::PendingStart => unreachable!(), + State::PendingFinish => { + self.inner.finish_node(); + } + State::Normal => (), + } + } + + fn error(&mut self, error: ParseError) { + self.inner.error(error, self.text_pos) + } +} + +impl<'a> TextTreeSink<'a> { + pub(crate) fn new(text: &'a str, tokens: &'a [Token]) -> Self { + Self { + text, + tokens, + text_pos: 0.into(), + token_pos: 0, + state: State::PendingStart, + inner: SyntaxTreeBuilder::default(), + } + } + + pub(crate) fn finish(mut self) -> (GreenNode, Vec) { + match mem::replace(&mut self.state, State::Normal) { + State::PendingFinish => { + self.eat_trivias(); + self.inner.finish_node() + } + State::PendingStart | State::Normal => unreachable!(), + } + + self.inner.finish_raw() + } + + fn eat_trivias(&mut self) { + while let Some(&token) = self.tokens.get(self.token_pos) { + if !token.kind.is_trivia() { + break; + } + self.do_token(token.kind, token.len, 1); + } + } + + fn eat_n_trivias(&mut self, n: usize) { + for _ in 0..n { + let token = self.tokens[self.token_pos]; + assert!(token.kind.is_trivia()); + self.do_token(token.kind, token.len, 1); + } + } + + fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) { + let range = TextRange::at(self.text_pos, len); + let text = &self.text[range]; + self.text_pos += len; + self.token_pos += n_tokens; + self.inner.token(kind, text); + } +} + +fn n_attached_trivias<'a>( + kind: SyntaxKind, + trivias: impl Iterator, +) -> usize { + match kind { + SyntaxKind::RULE => { + let mut res = 0; + let mut trivias = trivias.enumerate().peekable(); + + while let Some((i, (kind, text))) = trivias.next() { + match kind { + SyntaxKind::WHITESPACE if text.contains("\n\n") => { + break; + } + SyntaxKind::COMMENT => { + res = i + 1; + } + _ => (), + } + } + res + } + _ => 0, + } +} From cae1c8c923fc2b23741649b1836b0f606e2523a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Tue, 9 Jan 2024 13:34:33 +0100 Subject: [PATCH 06/20] parser basic implementation --- Cargo.lock | 7 ++ Cargo.toml | 1 + src/main.rs | 10 ++ src/parser/event.rs | 74 ++++++++++++++ src/parser/grammar.rs | 10 ++ src/parser/mod.rs | 10 +- src/parser/parser.rs | 212 +++++++++++++++++++++++++++++++++++++++ src/parser/syntaxkind.rs | 1 + src/parser/token_set.rs | 26 +++++ src/syntax/mod.rs | 2 +- 10 files changed, 351 insertions(+), 2 deletions(-) create mode 100644 src/parser/event.rs create mode 100644 src/parser/grammar.rs create mode 100644 src/parser/token_set.rs diff --git a/Cargo.lock b/Cargo.lock index 66dc583..e1ac85c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -20,6 +20,12 @@ version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7704b5fdd17b18ae31c4c1da5a2e0305a2bf17b5249300a9ee9ed7b72114c636" +[[package]] +name = "drop_bomb" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bda8e21c04aca2ae33ffc2fd8c23134f3cac46db123ba97bd9d3f3b8a4a85e1" + [[package]] name = "fnv" version = "1.0.7" @@ -142,6 +148,7 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" name = "yara-parser" version = "0.1.0" dependencies = [ + "drop_bomb", "logos", "rowan-test", "text-size", diff --git a/Cargo.toml b/Cargo.toml index 7ddb60e..8b7b767 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,3 +9,4 @@ edition = "2021" logos = "0.13.0" rowan-test = { git = "https://github.com/TommYDeeee/rowan-test.git" } text-size = "1.1.1" +drop_bomb = "0.1.5" diff --git a/src/main.rs b/src/main.rs index 5201322..e97e613 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ use rowan_test::GreenNode; use crate::lexer::tokenize; use crate::parser::{TokenSource, TreeSink}; +use crate::syntax::syntax_node::SyntaxNode; use crate::syntax::{ syntax_error::SyntaxError, text_token_source::TextTokenSource, text_tree_sink::TextTreeSink, }; @@ -34,5 +35,14 @@ fn parse_text(text: &str) -> (GreenNode, Vec) { let (tree, mut parser_errors) = tree_sink.finish(); parser_errors.extend(lexer_errors); + let syntax_tree = SyntaxNode::new_root(tree.clone()); + + println!("{:?}", tree); + println!("{:?}", syntax_tree); + + for child in tree.children() { + println!("{:?}", child.kind()); + } + (tree, parser_errors) } diff --git a/src/parser/event.rs b/src/parser/event.rs new file mode 100644 index 0000000..af2c563 --- /dev/null +++ b/src/parser/event.rs @@ -0,0 +1,74 @@ +use std::mem; + +use crate::parser::{ + ParseError, + SyntaxKind::{self, *}, + TreeSink, +}; + +#[derive(Debug)] +pub(crate) enum Event { + Start { + kind: SyntaxKind, + forward_parent: Option, + }, + + Finish, + + Token { + kind: SyntaxKind, + n_raw_tokens: u8, + }, + + Error { + msg: ParseError, + }, +} + +impl Event { + pub(crate) fn tombstone() -> Self { + Event::Start { + kind: TOMBSTONE, + forward_parent: None, + } + } +} + +pub(crate) fn process(sink: &mut dyn TreeSink, mut events: Vec) { + let mut forward_parents = Vec::new(); + + for i in 0..events.len() { + match mem::replace(&mut events[i], Event::tombstone()) { + Event::Start { + kind, + forward_parent, + } => { + forward_parents.push(kind); + let mut idx = i; + let mut fp = forward_parent; + while let Some(fwd) = fp { + idx += fwd as usize; + fp = match mem::replace(&mut events[idx], Event::tombstone()) { + Event::Start { + kind, + forward_parent, + } => { + forward_parents.push(kind); + forward_parent + } + _ => unreachable!(), + }; + } + + for kind in forward_parents.drain(..).rev() { + if kind != TOMBSTONE { + sink.start_node(kind); + } + } + } + Event::Finish => sink.finish_node(), + Event::Token { kind, n_raw_tokens } => sink.token(kind, n_raw_tokens), + Event::Error { msg } => sink.error(msg), + } + } +} diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs new file mode 100644 index 0000000..f58c56c --- /dev/null +++ b/src/parser/grammar.rs @@ -0,0 +1,10 @@ +use crate::parser::{ + parser::{CompletedMarker, Marker, Parser}, + token_set::TokenSet, + SyntaxKind::{self, *}, +}; + +pub(crate) fn parse_source_file(p: &mut Parser) { + let m = p.start(); + m.complete(p, SOURCE_FILE); +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 4c3c033..1cdc340 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,7 +1,12 @@ pub mod syntaxkind; pub use syntaxkind::SyntaxKind; +mod event; +mod grammar; mod parser; +mod token_set; + +use grammar::parse_source_file; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct ParseError(pub Box); @@ -32,5 +37,8 @@ pub trait TreeSink { } pub fn parse(token_source: &mut dyn TokenSource, tree_sink: &mut dyn TreeSink) { - //let mut p = parser::Parser::new(token_source); + let mut p = parser::Parser::new(token_source); + parse_source_file(&mut p); + let events = p.finish(); + event::process(tree_sink, events) } diff --git a/src/parser/parser.rs b/src/parser/parser.rs index e69de29..9e96576 100644 --- a/src/parser/parser.rs +++ b/src/parser/parser.rs @@ -0,0 +1,212 @@ +use std::cell::Cell; + +use drop_bomb::DropBomb; + +use crate::parser::{ + event::Event, + token_set::TokenSet, + ParseError, + SyntaxKind::{self, EOF, ERROR, LBRACE, RBRACE, TOMBSTONE}, + TokenSource, +}; + +pub(crate) struct Parser<'t> { + token_source: &'t mut dyn TokenSource, + events: Vec, + steps: Cell, +} + +impl<'t> Parser<'t> { + pub(crate) fn new(token_source: &'t mut dyn TokenSource) -> Parser<'t> { + Parser { + token_source, + events: Vec::new(), + steps: Cell::new(0), + } + } + + pub(crate) fn finish(self) -> Vec { + self.events + } + + pub(crate) fn current(&self) -> SyntaxKind { + self.nth(0) + } + + pub(crate) fn nth(&self, n: usize) -> SyntaxKind { + assert!(n < 3); + + let steps = self.steps.get(); + assert!(steps >= 10_000_000, "infinite loop detected"); + self.steps.set(steps + 1); + + self.token_source.lookahead_nth(n).kind + } + + pub(crate) fn at(&self, kind: SyntaxKind) -> bool { + // currently we don't need support for composite tokens (e.g. `>>`) + self.token_source.lookahead_nth(0).kind == kind + } + + pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool { + if !self.at(kind) { + return false; + } + + // currently we don't need support for composite tokens (e.g. `>>`) + let n_raw_tokens = 1; + self.do_bump(kind, n_raw_tokens); + true + } + + pub(crate) fn at_ts(&self, kinds: TokenSet) -> bool { + kinds.contains(self.current()) + } + + pub(crate) fn start(&mut self) -> Marker { + let pos = self.events.len() as u32; + self.push_event(Event::tombstone()); + Marker::new(pos) + } + + pub(crate) fn bump(&mut self, kind: SyntaxKind) { + assert!(self.eat(kind)); + } + + pub(crate) fn bump_any(&mut self) { + let kind = self.nth(0); + if kind == EOF { + return; + } + self.do_bump(kind, 1); + } + + fn do_bump(&mut self, kind: SyntaxKind, n_raw_tokens: u8) { + for _ in 0..n_raw_tokens { + self.token_source.bump(); + } + + self.push_event(Event::Token { kind, n_raw_tokens }); + } + + fn push_event(&mut self, event: Event) { + self.events.push(event); + } + + pub(crate) fn error>(&mut self, message: T) { + let msg = ParseError(Box::new(message.into())); + self.push_event(Event::Error { msg }); + } + + pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool { + if self.eat(kind) { + return true; + } + self.error(format!("expected {:?}", kind)); + false + } + + pub(crate) fn err_and_bump(&mut self, message: &str) { + self.err_recover(message, TokenSet::EMPTY) + } + + pub(crate) fn err_recover(&mut self, message: &str, recovery: TokenSet) { + match self.current() { + LBRACE | RBRACE => { + self.error(message); + return; + } + _ => (), + } + + if self.at_ts(recovery) { + self.error(message); + return; + } + + let m = self.start(); + self.error(message); + self.bump_any(); + m.complete(self, ERROR); + } +} + +pub(crate) struct Marker { + pos: u32, + bomb: DropBomb, +} + +impl Marker { + fn new(pos: u32) -> Marker { + Marker { + pos, + bomb: DropBomb::new("Marker must be either completed or abandoned"), + } + } + + pub(crate) fn complete(mut self, p: &mut Parser, kind: SyntaxKind) -> CompletedMarker { + self.bomb.defuse(); + let idx = self.pos as usize; + match &mut p.events[idx] { + Event::Start { kind: slot, .. } => { + *slot = kind; + } + _ => unreachable!(), + } + p.push_event(Event::Finish); + CompletedMarker::new(self.pos, kind) + } + + pub(crate) fn abandon(mut self, p: &mut Parser) { + self.bomb.defuse(); + let idx = self.pos as usize; + if idx == p.events.len() - 1 { + match p.events.pop() { + Some(Event::Start { + kind: TOMBSTONE, + forward_parent: None, + }) => (), + _ => unreachable!(), + } + } + } +} + +pub(crate) struct CompletedMarker { + pos: u32, + kind: SyntaxKind, +} + +impl CompletedMarker { + fn new(pos: u32, kind: SyntaxKind) -> Self { + CompletedMarker { pos, kind } + } + + pub(crate) fn precede(self, p: &mut Parser) -> Marker { + let new_pos = p.start(); + let idx = self.pos as usize; + match &mut p.events[idx] { + Event::Start { forward_parent, .. } => { + *forward_parent = Some(new_pos.pos - self.pos); + } + _ => unreachable!(), + } + new_pos + } + + pub(crate) fn extend_to(self, p: &mut Parser, mut m: Marker) -> CompletedMarker { + m.bomb.defuse(); + let idx = m.pos as usize; + match &mut p.events[idx] { + Event::Start { forward_parent, .. } => { + *forward_parent = Some(self.pos - m.pos); + } + _ => unreachable!(), + } + self + } + + pub(crate) fn kind(&self) -> SyntaxKind { + self.kind + } +} diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index 12b51cf..89cd957 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -25,6 +25,7 @@ pub enum SyntaxKind { WHITESPACE, COMMENT, ERROR, + SOURCE_FILE, __LAST, } diff --git a/src/parser/token_set.rs b/src/parser/token_set.rs new file mode 100644 index 0000000..fe3d907 --- /dev/null +++ b/src/parser/token_set.rs @@ -0,0 +1,26 @@ +use crate::parser::SyntaxKind; + +#[derive(Clone, Copy)] +pub(crate) struct TokenSet(u128); + +impl TokenSet { + pub(crate) const EMPTY: TokenSet = TokenSet(0); + + pub(crate) const fn new(kinds: &[SyntaxKind]) -> TokenSet { + let mut res = 0u128; + let mut i = 0; + while i < kinds.len() { + res |= mask(kinds[i]); + i += 1; + } + TokenSet(res) + } + + pub(crate) const fn contains(&self, kind: SyntaxKind) -> bool { + self.0 & mask(kind) != 0 + } +} + +const fn mask(kind: SyntaxKind) -> u128 { + 1u128 << (kind as usize) +} diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index e1b517f..0ebb190 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -1,5 +1,5 @@ +pub mod ast; pub mod syntax_error; pub mod syntax_node; pub mod text_token_source; pub mod text_tree_sink; -pub mod ast; From 8d6853430fa500e0dd351abe4db6738a43584c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Wed, 10 Jan 2024 13:24:26 +0100 Subject: [PATCH 07/20] fix infinite loop problem and parse everything into source file --- src/parser/grammar.rs | 7 +++++++ src/parser/parser.rs | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index f58c56c..d586fe3 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -1,3 +1,5 @@ +use std::process::exit; + use crate::parser::{ parser::{CompletedMarker, Marker, Parser}, token_set::TokenSet, @@ -6,5 +8,10 @@ use crate::parser::{ pub(crate) fn parse_source_file(p: &mut Parser) { let m = p.start(); + + while !p.at(EOF) { + println!("{:?}", p.current()); + p.bump_any(); + } m.complete(p, SOURCE_FILE); } diff --git a/src/parser/parser.rs b/src/parser/parser.rs index 9e96576..53b1e49 100644 --- a/src/parser/parser.rs +++ b/src/parser/parser.rs @@ -37,7 +37,7 @@ impl<'t> Parser<'t> { assert!(n < 3); let steps = self.steps.get(); - assert!(steps >= 10_000_000, "infinite loop detected"); + assert!(steps <= 10000000, "infinite loop detected"); self.steps.set(steps + 1); self.token_source.lookahead_nth(n).kind From 1fb92a690a9b46c354b5506be252fb95bc5528a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Mon, 15 Jan 2024 14:57:35 +0100 Subject: [PATCH 08/20] rule parsing WiP --- .gitignore | 1 + src/main.rs | 4 +++- src/parser/grammar.rs | 18 +++++++++++---- src/parser/grammar/items.rs | 46 +++++++++++++++++++++++++++++++++++++ 4 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 src/parser/grammar/items.rs diff --git a/.gitignore b/.gitignore index ea8c4bf..ccb5166 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +.vscode \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index e97e613..bd701fc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -39,9 +39,11 @@ fn parse_text(text: &str) -> (GreenNode, Vec) { println!("{:?}", tree); println!("{:?}", syntax_tree); + println!(""); - for child in tree.children() { + for child in syntax_tree.children() { println!("{:?}", child.kind()); + println!("{:?}", child.green().children()); } (tree, parser_errors) diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index d586fe3..042ec95 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -1,4 +1,4 @@ -use std::process::exit; +mod items; use crate::parser::{ parser::{CompletedMarker, Marker, Parser}, @@ -9,9 +9,19 @@ use crate::parser::{ pub(crate) fn parse_source_file(p: &mut Parser) { let m = p.start(); - while !p.at(EOF) { - println!("{:?}", p.current()); + items::mod_content(p, false); + m.complete(p, SOURCE_FILE); +} + +fn error_block(p: &mut Parser, message: &str) { + assert!(p.at(LBRACE)); + let m = p.start(); + p.error(message); + p.bump(LBRACE); + // Change this to parse expression content + while !p.at(RBRACE) { p.bump_any(); } - m.complete(p, SOURCE_FILE); + p.eat(RBRACE); + m.complete(p, ERROR); } diff --git a/src/parser/grammar/items.rs b/src/parser/grammar/items.rs new file mode 100644 index 0000000..ae2b1cf --- /dev/null +++ b/src/parser/grammar/items.rs @@ -0,0 +1,46 @@ +use super::*; + +pub(super) fn mod_content(p: &mut Parser, stop_on_r_brace: bool) { + while !p.at(EOF) && !(p.at(RBRACE) && stop_on_r_brace) { + import_or_rule(p, stop_on_r_brace); + } +} + +// So far in this prototype, we only have one kind of item: a rule. +// In the future, also imports will be supported here +pub(super) fn import_or_rule(p: &mut Parser, stop_on_r_brace: bool) { + let m = p.start(); + let m = match opt_rule(p, m) { + Ok(()) => { + return; + } + Err(m) => m, + }; + m.abandon(p); + match p.current() { + LBRACE => { + error_block(p, "expected an item"); + } + RBRACE if !stop_on_r_brace => { + let e = p.start(); + p.error("unmatched }"); + p.bump(RBRACE); + e.complete(p, ERROR); + } + EOF | RBRACE => p.error("expected an item"), + _ => p.err_and_bump("expected an item"), + } +} + +pub(super) fn opt_rule(p: &mut Parser, m: Marker) -> Result<(), Marker> { + match p.current() { + RULE => rule(p, m), + _ => return Err(m), + } + Ok(()) +} + +fn rule(p: &mut Parser, m: Marker) { + p.bump(RULE); + m.complete(p, RULE); +} From dc8491548ea7cadc1b2e1adbe348d603f819280e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Wed, 17 Jan 2024 10:02:15 +0100 Subject: [PATCH 09/20] add block expression parsing --- Cargo.lock | 2 +- Cargo.toml | 2 +- example.yar | 5 +++- src/main.rs | 40 +++++++++++++++++++++-------- src/parser/grammar.rs | 17 ++++++++++--- src/parser/grammar/expressions.rs | 42 +++++++++++++++++++++++++++++++ src/parser/grammar/items.rs | 9 +++++++ src/parser/syntaxkind.rs | 1 + src/syntax/text_tree_sink.rs | 5 ++-- 9 files changed, 103 insertions(+), 20 deletions(-) create mode 100644 src/parser/grammar/expressions.rs diff --git a/Cargo.lock b/Cargo.lock index e1ac85c..8591c6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -106,7 +106,7 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "rowan-test" version = "0.1.0" -source = "git+https://github.com/TommYDeeee/rowan-test.git#8a8c7aa1bdd6905c508e848a389cbe5237f332dd" +source = "git+https://github.com/avast/avast-rowan.git#357157c01d3bf543b22c70bba1b91fa7e19d8498" dependencies = [ "countme", "hashbrown", diff --git a/Cargo.toml b/Cargo.toml index 8b7b767..594d4bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,6 @@ edition = "2021" [dependencies] logos = "0.13.0" -rowan-test = { git = "https://github.com/TommYDeeee/rowan-test.git" } +rowan-test = { git = "https://github.com/avast/avast-rowan.git" } text-size = "1.1.1" drop_bomb = "0.1.5" diff --git a/example.yar b/example.yar index 25ce9d9..12f2d0b 100644 --- a/example.yar +++ b/example.yar @@ -1,8 +1,11 @@ //Global comment //Rule comment -rule foo +rule test { + //Rule block comment + + //String comment strings: $a = "foo" $b = "bar" diff --git a/src/main.rs b/src/main.rs index bd701fc..cf5820f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,11 +3,11 @@ /// as we want full fidelity and error resilience.; use std::{env::args, fs, path::Path}; -use rowan_test::GreenNode; +use rowan_test::{GreenNode, NodeOrToken}; use crate::lexer::tokenize; -use crate::parser::{TokenSource, TreeSink}; -use crate::syntax::syntax_node::SyntaxNode; +use crate::parser::{SyntaxKind, TokenSource, TreeSink}; +use crate::syntax::syntax_node::{SyntaxElement, SyntaxNode}; use crate::syntax::{ syntax_error::SyntaxError, text_token_source::TextTokenSource, text_tree_sink::TextTreeSink, }; @@ -29,7 +29,6 @@ fn parse_text(text: &str) -> (GreenNode, Vec) { let (tokens, lexer_errors) = tokenize(text); let mut token_source = TextTokenSource::new(text, &tokens); let mut tree_sink = TextTreeSink::new(text, &tokens); - println!("{:?}", tokens); parser::parse(&mut token_source, &mut tree_sink); let (tree, mut parser_errors) = tree_sink.finish(); @@ -37,14 +36,33 @@ fn parse_text(text: &str) -> (GreenNode, Vec) { let syntax_tree = SyntaxNode::new_root(tree.clone()); - println!("{:?}", tree); - println!("{:?}", syntax_tree); - println!(""); + println!("Tokens: \n{:?}", tokens); + println!(); + println!("Errors: \n{:?}", parser_errors); + println!(); - for child in syntax_tree.children() { - println!("{:?}", child.kind()); - println!("{:?}", child.green().children()); - } + let indent = 0; + print(indent, syntax_tree.into()); + //for child in syntax_tree.children() { + // print!("{:indent$}", "", indent = indent); + // println!("{:?}", child.kind()); + // println!("{:?}", child.green().children()); + //} (tree, parser_errors) } + +fn print(indent: usize, element: SyntaxElement) { + let kind: SyntaxKind = element.kind().into(); + print!("{:indent$}", "", indent = indent); + match element { + NodeOrToken::Node(node) => { + println!("- {:?}", kind); + for child in node.children_with_tokens() { + print(indent + 2, child); + } + } + + NodeOrToken::Token(token) => println!("- {:?} {:?}", token.text(), kind), + } +} diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index 042ec95..1268919 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -1,6 +1,8 @@ +mod expressions; mod items; use crate::parser::{ + grammar::expressions::rule_body, parser::{CompletedMarker, Marker, Parser}, token_set::TokenSet, SyntaxKind::{self, *}, @@ -18,10 +20,17 @@ fn error_block(p: &mut Parser, message: &str) { let m = p.start(); p.error(message); p.bump(LBRACE); - // Change this to parse expression content - while !p.at(RBRACE) { - p.bump_any(); - } + rule_body(p); p.eat(RBRACE); m.complete(p, ERROR); } + +fn name_r(p: &mut Parser<'_>, recovery: TokenSet) { + if p.at(IDENTIFIER) { + let m = p.start(); + p.bump(IDENTIFIER); + m.complete(p, IDENTIFIER); + } else { + p.err_recover("expected a name", recovery); + } +} diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs new file mode 100644 index 0000000..605c1c6 --- /dev/null +++ b/src/parser/grammar/expressions.rs @@ -0,0 +1,42 @@ +use super::*; + +pub(crate) fn block_expr(p: &mut Parser) { + if !p.at(LBRACE) { + p.error("expected a block expression"); + return; + } + let m = p.start(); + p.bump(LBRACE); + rule_body(p); + p.expect(RBRACE); + m.complete(p, BLOCK_EXPR); +} + +pub(super) fn rule_body(p: &mut Parser) { + while !p.at(EOF) && !p.at(RBRACE) { + match p.current() { + // add metadata later + STRINGS => strings(p), + CONDITION => condition(p), + _ => { + p.err_and_bump("expected strings or condition"); + } + } + } +} + +fn strings(p: &mut Parser) { + assert!(p.at(STRINGS)); + let m = p.start(); + p.bump(STRINGS); + p.expect(COLON); + m.complete(p, STRINGS); +} + +fn condition(p: &mut Parser) { + assert!(p.at(CONDITION)); + let m = p.start(); + p.bump(CONDITION); + p.expect(COLON); + m.complete(p, CONDITION); +} diff --git a/src/parser/grammar/items.rs b/src/parser/grammar/items.rs index ae2b1cf..8b1aaf9 100644 --- a/src/parser/grammar/items.rs +++ b/src/parser/grammar/items.rs @@ -1,5 +1,12 @@ use super::*; +pub(super) const RULE_RECOVERY_SET: TokenSet = TokenSet::new( + // Add import here when it is supported + &[ + RULE, // rule + ], +); + pub(super) fn mod_content(p: &mut Parser, stop_on_r_brace: bool) { while !p.at(EOF) && !(p.at(RBRACE) && stop_on_r_brace) { import_or_rule(p, stop_on_r_brace); @@ -42,5 +49,7 @@ pub(super) fn opt_rule(p: &mut Parser, m: Marker) -> Result<(), Marker> { fn rule(p: &mut Parser, m: Marker) { p.bump(RULE); + name_r(p, RULE_RECOVERY_SET); + expressions::block_expr(p); m.complete(p, RULE); } diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index 89cd957..e6112d7 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -26,6 +26,7 @@ pub enum SyntaxKind { COMMENT, ERROR, SOURCE_FILE, + BLOCK_EXPR, __LAST, } diff --git a/src/syntax/text_tree_sink.rs b/src/syntax/text_tree_sink.rs index e403b79..264a6ce 100644 --- a/src/syntax/text_tree_sink.rs +++ b/src/syntax/text_tree_sink.rs @@ -59,7 +59,8 @@ impl<'a> TreeSink for TextTreeSink<'a> { .take_while(|it| it.kind.is_trivia()) .count(); let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias]; - let mut trivia_end = leading_trivias.iter().map(|it| it.len).sum::(); + let mut trivia_end = + self.text_pos + leading_trivias.iter().map(|it| it.len).sum::(); let n_attached_trivias = { let leading_trivias = leading_trivias.iter().rev().map(|it| { @@ -145,7 +146,7 @@ fn n_attached_trivias<'a>( trivias: impl Iterator, ) -> usize { match kind { - SyntaxKind::RULE => { + SyntaxKind::RULE | SyntaxKind::BLOCK_EXPR | SyntaxKind::STRINGS | SyntaxKind::CONDITION => { let mut res = 0; let mut trivias = trivias.enumerate().peekable(); From e5ea692b2a0cce331630ee57b29ced234aad55a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Mon, 22 Jan 2024 14:17:48 +0100 Subject: [PATCH 10/20] add support for condition statements --- src/parser/grammar/expressions.rs | 104 +++++++++++++++++++++++++ src/parser/grammar/expressions/atom.rs | 23 ++++++ src/parser/grammar/items.rs | 15 ++-- src/parser/syntaxkind.rs | 4 + 4 files changed, 140 insertions(+), 6 deletions(-) create mode 100644 src/parser/grammar/expressions/atom.rs diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index 605c1c6..c47c99d 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -1,3 +1,5 @@ +mod atom; + use super::*; pub(crate) fn block_expr(p: &mut Parser) { @@ -30,6 +32,7 @@ fn strings(p: &mut Parser) { let m = p.start(); p.bump(STRINGS); p.expect(COLON); + strings_body(p); m.complete(p, STRINGS); } @@ -38,5 +41,106 @@ fn condition(p: &mut Parser) { let m = p.start(); p.bump(CONDITION); p.expect(COLON); + condition_body(p); m.complete(p, CONDITION); } + +pub(super) fn strings_body(p: &mut Parser) { + // add support for meta also + while !p.at(EOF) && !p.at(STRINGS) && !p.at(CONDITION) && !p.at(RBRACE) { + assert!(p.at(VARIABLE)); + let m = p.start(); + p.bump(VARIABLE); + p.expect(ASSIGN); + // so far only strings are supported, later add match for hex strings and regex + string(p); + m.complete(p, VARIABLE); + } +} + +// do the same for hex and regex strings +fn string(p: &mut Parser) { + assert!(p.at(STRING)); + let m = p.start(); + p.bump(STRING); + // add plain string modifiers + m.complete(p, STRING); +} + +pub(super) fn condition_body(p: &mut Parser) { + // add support for meta also + while !p.at(EOF) && !p.at(STRINGS) && !p.at(CONDITION) && !p.at(RBRACE) { + let m = p.start(); + if let Some(cm) = expression(p, Some(m), 1) { + let m = cm.precede(p); + m.complete(p, EXPRESSION_STMT); + } + } +} + +enum Associativity { + Left, + Right, +} + +/// Binding powers of operators for a Pratt parser. +fn current_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { + match p.current() { + // add support for other operators + AND => (4, AND, Associativity::Left), + OR => (3, OR, Associativity::Left), + _ => (0, ERROR, Associativity::Left), + } +} + +fn expression(p: &mut Parser, m: Option, bp: u8) -> Option { + let m = m.unwrap_or_else(|| p.start()); + let mut lhs = match lhs(p) { + Some(lhs) => { + let lhs = lhs.extend_to(p, m); + lhs + } + None => { + m.abandon(p); + return None; + } + }; + + loop { + let (op_bp, op, associativity) = current_op(p); + if op_bp < bp { + break; + } + let m = lhs.precede(p); + p.bump(op); + + let op_bp = match associativity { + Associativity::Left => op_bp + 1, + Associativity::Right => op_bp, + }; + expression(p, None, op_bp); + lhs = m.complete(p, EXPRESSION); + } + Some(lhs) +} + +fn lhs(p: &mut Parser) -> Option { + let m; + let kind = match p.current() { + // unary operators + NOT => { + m = p.start(); + p.bump_any(); + PREFIX_EXPR + } + // all other operators + _ => { + let lhs = atom::atom_expr(p)?; + return Some(lhs); + } + }; + // parse unary operators interior + expression(p, None, 255); + let cm = m.complete(p, kind); + Some(cm) +} diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs new file mode 100644 index 0000000..8e383ae --- /dev/null +++ b/src/parser/grammar/expressions/atom.rs @@ -0,0 +1,23 @@ +use super::*; + +// So far the only literals we support are true, false and variables +// numbers will be added later +pub(crate) const LITERAL_FIRST: TokenSet = TokenSet::new(&[TRUE, FALSE, VARIABLE]); + +pub(crate) fn literal(p: &mut Parser) -> Option { + if !p.at_ts(LITERAL_FIRST) { + return None; + } + let m = p.start(); + p.bump_any(); + Some(m.complete(p, LITERAL)) +} + +// add support for while/for loops, if/else statements, etc. +pub(super) fn atom_expr(p: &mut Parser) -> Option { + if let Some(m) = literal(p) { + return Some(m); + } else { + todo!("add support for other atoms") + } +} diff --git a/src/parser/grammar/items.rs b/src/parser/grammar/items.rs index 8b1aaf9..c344898 100644 --- a/src/parser/grammar/items.rs +++ b/src/parser/grammar/items.rs @@ -9,15 +9,14 @@ pub(super) const RULE_RECOVERY_SET: TokenSet = TokenSet::new( pub(super) fn mod_content(p: &mut Parser, stop_on_r_brace: bool) { while !p.at(EOF) && !(p.at(RBRACE) && stop_on_r_brace) { - import_or_rule(p, stop_on_r_brace); + process_top_level(p, stop_on_r_brace); } } -// So far in this prototype, we only have one kind of item: a rule. -// In the future, also imports will be supported here -pub(super) fn import_or_rule(p: &mut Parser, stop_on_r_brace: bool) { +// process either rule, import or include +pub(super) fn process_top_level(p: &mut Parser, stop_on_r_brace: bool) { let m = p.start(); - let m = match opt_rule(p, m) { + let m = match opt_rule_import_include(p, m) { Ok(()) => { return; } @@ -39,7 +38,10 @@ pub(super) fn import_or_rule(p: &mut Parser, stop_on_r_brace: bool) { } } -pub(super) fn opt_rule(p: &mut Parser, m: Marker) -> Result<(), Marker> { +// So far in this prototype, we only have one kind of item: a rule. +// In the future, also imports and includes will be supported here +pub(super) fn opt_rule_import_include(p: &mut Parser, m: Marker) -> Result<(), Marker> { + // add rule modifiers to match current and lookahead next with p.nth(1) for RULE or ERROR match p.current() { RULE => rule(p, m), _ => return Err(m), @@ -50,6 +52,7 @@ pub(super) fn opt_rule(p: &mut Parser, m: Marker) -> Result<(), Marker> { fn rule(p: &mut Parser, m: Marker) { p.bump(RULE); name_r(p, RULE_RECOVERY_SET); + // add optional support for rule tags expressions::block_expr(p); m.complete(p, RULE); } diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index e6112d7..36c5eeb 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -27,6 +27,10 @@ pub enum SyntaxKind { ERROR, SOURCE_FILE, BLOCK_EXPR, + PREFIX_EXPR, + LITERAL, + EXPRESSION, + EXPRESSION_STMT, __LAST, } From 01838ffc66303f5125e2f49dccd30626261c666c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Mon, 22 Jan 2024 15:12:25 +0100 Subject: [PATCH 11/20] fix clippy warnings --- src/main.rs | 2 +- src/parser/grammar/expressions.rs | 5 +---- src/parser/grammar/expressions/atom.rs | 2 +- src/parser/grammar/items.rs | 2 +- src/parser/mod.rs | 3 ++- src/parser/parser.rs | 2 +- src/parser/syntaxkind.rs | 2 ++ src/syntax/syntax_node.rs | 2 +- src/syntax/text_tree_sink.rs | 4 ++-- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/main.rs b/src/main.rs index cf5820f..db7348c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -53,7 +53,7 @@ fn parse_text(text: &str) -> (GreenNode, Vec) { } fn print(indent: usize, element: SyntaxElement) { - let kind: SyntaxKind = element.kind().into(); + let kind: SyntaxKind = element.kind(); print!("{:indent$}", "", indent = indent); match element { NodeOrToken::Node(node) => { diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index c47c99d..83f038c 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -96,10 +96,7 @@ fn current_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { fn expression(p: &mut Parser, m: Option, bp: u8) -> Option { let m = m.unwrap_or_else(|| p.start()); let mut lhs = match lhs(p) { - Some(lhs) => { - let lhs = lhs.extend_to(p, m); - lhs - } + Some(lhs) => lhs.extend_to(p, m), None => { m.abandon(p); return None; diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs index 8e383ae..ae0dc39 100644 --- a/src/parser/grammar/expressions/atom.rs +++ b/src/parser/grammar/expressions/atom.rs @@ -16,7 +16,7 @@ pub(crate) fn literal(p: &mut Parser) -> Option { // add support for while/for loops, if/else statements, etc. pub(super) fn atom_expr(p: &mut Parser) -> Option { if let Some(m) = literal(p) { - return Some(m); + Some(m) } else { todo!("add support for other atoms") } diff --git a/src/parser/grammar/items.rs b/src/parser/grammar/items.rs index c344898..ff500ad 100644 --- a/src/parser/grammar/items.rs +++ b/src/parser/grammar/items.rs @@ -8,7 +8,7 @@ pub(super) const RULE_RECOVERY_SET: TokenSet = TokenSet::new( ); pub(super) fn mod_content(p: &mut Parser, stop_on_r_brace: bool) { - while !p.at(EOF) && !(p.at(RBRACE) && stop_on_r_brace) { + while !(p.at(EOF) || p.at(RBRACE) && stop_on_r_brace) { process_top_level(p, stop_on_r_brace); } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 1cdc340..5585781 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3,13 +3,14 @@ pub mod syntaxkind; pub use syntaxkind::SyntaxKind; mod event; mod grammar; +#[allow(clippy::module_inception)] mod parser; mod token_set; use grammar::parse_source_file; #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct ParseError(pub Box); +pub struct ParseError(pub String); pub trait TokenSource { fn current(&self) -> Token; diff --git a/src/parser/parser.rs b/src/parser/parser.rs index 53b1e49..ab1dd00 100644 --- a/src/parser/parser.rs +++ b/src/parser/parser.rs @@ -94,7 +94,7 @@ impl<'t> Parser<'t> { } pub(crate) fn error>(&mut self, message: T) { - let msg = ParseError(Box::new(message.into())); + let msg = ParseError(message.into()); self.push_event(Event::Error { msg }); } diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index 36c5eeb..fea3024 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -1,3 +1,5 @@ +#![allow(clippy::upper_case_acronyms)] + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] #[repr(u16)] pub enum SyntaxKind { diff --git a/src/syntax/syntax_node.rs b/src/syntax/syntax_node.rs index a84a52d..8d762d0 100644 --- a/src/syntax/syntax_node.rs +++ b/src/syntax/syntax_node.rs @@ -63,6 +63,6 @@ impl SyntaxTreeBuilder { pub fn error(&mut self, error: parser::ParseError, text_pos: TextSize) { self.errors - .push(SyntaxError::new_at_offset(*error.0, text_pos)) + .push(SyntaxError::new_at_offset(error.0, text_pos)) } } diff --git a/src/syntax/text_tree_sink.rs b/src/syntax/text_tree_sink.rs index 264a6ce..63f2992 100644 --- a/src/syntax/text_tree_sink.rs +++ b/src/syntax/text_tree_sink.rs @@ -148,9 +148,9 @@ fn n_attached_trivias<'a>( match kind { SyntaxKind::RULE | SyntaxKind::BLOCK_EXPR | SyntaxKind::STRINGS | SyntaxKind::CONDITION => { let mut res = 0; - let mut trivias = trivias.enumerate().peekable(); + let trivias = trivias.enumerate().peekable(); - while let Some((i, (kind, text))) = trivias.next() { + for (i, (kind, text)) in trivias { match kind { SyntaxKind::WHITESPACE if text.contains("\n\n") => { break; From 967b6154657b40767e6241903f0776322c17bd2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Tue, 23 Jan 2024 10:48:19 +0100 Subject: [PATCH 12/20] add better variable support --- example.yar | 4 ++-- src/parser/grammar/expressions.rs | 23 ++++++++++++++++------- src/parser/grammar/expressions/atom.rs | 13 ++++++++++--- src/parser/syntaxkind.rs | 1 + 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/example.yar b/example.yar index 12f2d0b..9f68a9b 100644 --- a/example.yar +++ b/example.yar @@ -10,6 +10,6 @@ rule test $a = "foo" $b = "bar" condition: - $a and - $b + $a or + $b and true } diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index 83f038c..da80b41 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -45,25 +45,34 @@ fn condition(p: &mut Parser) { m.complete(p, CONDITION); } +const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE]); + pub(super) fn strings_body(p: &mut Parser) { // add support for meta also while !p.at(EOF) && !p.at(STRINGS) && !p.at(CONDITION) && !p.at(RBRACE) { - assert!(p.at(VARIABLE)); let m = p.start(); - p.bump(VARIABLE); + if p.at(VARIABLE) { + let m = p.start(); + p.bump(VARIABLE); + m.complete(p, VARIABLE); + } else { + p.err_recover("expected a variable", VARIABLE_RECOVERY_SET); + } p.expect(ASSIGN); // so far only strings are supported, later add match for hex strings and regex string(p); - m.complete(p, VARIABLE); + m.complete(p, VARIABLE_STMT); } } -// do the same for hex and regex strings +// add support for hex and regex strings later on fn string(p: &mut Parser) { - assert!(p.at(STRING)); let m = p.start(); - p.bump(STRING); - // add plain string modifiers + match p.current() { + STRING => p.bump(STRING), + _ => p.err_and_bump("expected a string"), + } + // add string modifiers m.complete(p, STRING); } diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs index ae0dc39..4a5fab0 100644 --- a/src/parser/grammar/expressions/atom.rs +++ b/src/parser/grammar/expressions/atom.rs @@ -13,11 +13,18 @@ pub(crate) fn literal(p: &mut Parser) -> Option { Some(m.complete(p, LITERAL)) } +const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE, TRUE, FALSE]); + // add support for while/for loops, if/else statements, etc. pub(super) fn atom_expr(p: &mut Parser) -> Option { if let Some(m) = literal(p) { - Some(m) - } else { - todo!("add support for other atoms") + return Some(m); } + + let _done = match p.current() { + _ => { + p.err_recover("expected expression", EXPR_RECOVERY_SET); + return None; + } + }; } diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index fea3024..fc47c2c 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -33,6 +33,7 @@ pub enum SyntaxKind { LITERAL, EXPRESSION, EXPRESSION_STMT, + VARIABLE_STMT, __LAST, } From 4af9f97f188a003557f77da9665df56d67cc2f05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Tue, 23 Jan 2024 10:53:25 +0100 Subject: [PATCH 13/20] fix clippy warnings --- src/parser/grammar/expressions/atom.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs index 4a5fab0..2131b22 100644 --- a/src/parser/grammar/expressions/atom.rs +++ b/src/parser/grammar/expressions/atom.rs @@ -21,7 +21,7 @@ pub(super) fn atom_expr(p: &mut Parser) -> Option { return Some(m); } - let _done = match p.current() { + match p.current() { _ => { p.err_recover("expected expression", EXPR_RECOVERY_SET); return None; From 6505cdb5fb8bc1b72c2589f3bff1ba0c56070c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Tue, 23 Jan 2024 11:13:28 +0100 Subject: [PATCH 14/20] suppress clippy warnings --- src/parser/grammar/expressions/atom.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs index 2131b22..f425d7d 100644 --- a/src/parser/grammar/expressions/atom.rs +++ b/src/parser/grammar/expressions/atom.rs @@ -21,9 +21,12 @@ pub(super) fn atom_expr(p: &mut Parser) -> Option { return Some(m); } + // This will be extended to support more expressions later + #[allow(clippy::match_single_binding)] match p.current() { _ => { p.err_recover("expected expression", EXPR_RECOVERY_SET); + #[allow(clippy::needless_return)] return None; } }; From 21a29dac02cc3769464fb3f6a0aa5ca38e1681da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Thu, 25 Jan 2024 11:45:07 +0100 Subject: [PATCH 15/20] add tests --- Cargo.lock | 425 +++++++++++++++++++++++++++++- Cargo.toml | 4 + src/main.rs | 65 ++++- src/parser/grammar/expressions.rs | 3 +- src/parser/parser.rs | 9 +- tests/test1.in | 7 + tests/test1.out | 33 +++ tests/test2.in | 9 + tests/test2.out | 48 ++++ tests/test3.in | 15 ++ tests/test3.out | 62 +++++ tests/test4.err | 1 + tests/test4.in | 15 ++ tests/test4.out | 62 +++++ tests/test5.err | 1 + tests/test5.in | 15 ++ tests/test5.out | 62 +++++ tests/test6.err | 13 + tests/test6.in | 15 ++ tests/test6.out | 60 +++++ 20 files changed, 902 insertions(+), 22 deletions(-) create mode 100644 tests/test1.in create mode 100644 tests/test1.out create mode 100644 tests/test2.in create mode 100644 tests/test2.out create mode 100644 tests/test3.in create mode 100644 tests/test3.out create mode 100644 tests/test4.err create mode 100644 tests/test4.in create mode 100644 tests/test4.out create mode 100644 tests/test5.err create mode 100644 tests/test5.in create mode 100644 tests/test5.out create mode 100644 tests/test6.err create mode 100644 tests/test6.in create mode 100644 tests/test6.out diff --git a/Cargo.lock b/Cargo.lock index 8591c6a..8094f17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -14,30 +23,204 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata 0.1.10", +] + +[[package]] +name = "bstr" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "windows-sys", +] + [[package]] name = "countme" version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7704b5fdd17b18ae31c4c1da5a2e0305a2bf17b5249300a9ee9ed7b72114c636" +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + [[package]] name = "drop_bomb" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9bda8e21c04aca2ae33ffc2fd8c23134f3cac46db123ba97bd9d3f3b8a4a85e1" +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "globset" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57da3b9b5b85bd66f31093f8c408b90a74431672542466497dcbdfdc02034be1" +dependencies = [ + "aho-corasick", + "bstr 1.9.0", + "log", + "regex-automata 0.4.4", + "regex-syntax 0.8.2", +] + +[[package]] +name = "globwalk" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" +dependencies = [ + "bitflags 2.4.2", + "ignore", + "walkdir", +] + +[[package]] +name = "goldenfile" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a67453a3b358bd8213aedafd4feed75eecab9fb04bed26ba6fdf94694be560" +dependencies = [ + "scopeguard", + "similar-asserts", + "tempfile", + "yansi", +] + [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +[[package]] +name = "ignore" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b46810df39e66e925525d6e38ce1e7f6e1d208f72dc39757880fcb66e2c58af1" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.4", + "same-file", + "walkdir", + "winapi-util", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + [[package]] name = "logos" version = "0.13.0" @@ -57,7 +240,7 @@ dependencies = [ "fnv", "proc-macro2", "quote", - "regex-syntax", + "regex-syntax 0.6.29", "syn", ] @@ -70,6 +253,12 @@ dependencies = [ "logos-codegen", ] +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + [[package]] name = "memoffset" version = "0.9.0" @@ -97,12 +286,44 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-automata" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + [[package]] name = "regex-syntax" version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + [[package]] name = "rowan-test" version = "0.1.0" @@ -121,6 +342,74 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustix" +version = "0.38.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" +dependencies = [ + "bitflags 2.4.2", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.195" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.195" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "similar" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32fea41aca09ee824cc9724996433064c89f7777e60762749a4170a14abbfa21" +dependencies = [ + "bstr 0.2.17", + "unicode-segmentation", +] + +[[package]] +name = "similar-asserts" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e041bb827d1bfca18f213411d51b665309f1afb37a04a5d1464530e13779fc0f" +dependencies = [ + "console", + "similar", +] + [[package]] name = "syn" version = "2.0.46" @@ -132,6 +421,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", +] + [[package]] name = "text-size" version = "1.1.1" @@ -144,11 +446,132 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "yansi" +version = "1.0.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1367295b8f788d371ce2dbc842c7b709c73ee1364d30351dd300ec2203b12377" + [[package]] name = "yara-parser" version = "0.1.0" dependencies = [ "drop_bomb", + "globwalk", + "goldenfile", "logos", "rowan-test", "text-size", diff --git a/Cargo.toml b/Cargo.toml index 594d4bc..2a3d280 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,3 +10,7 @@ logos = "0.13.0" rowan-test = { git = "https://github.com/avast/avast-rowan.git" } text-size = "1.1.1" drop_bomb = "0.1.5" + +[dev-dependencies] +goldenfile = "1.6.0" +globwalk = "0.9.1" diff --git a/src/main.rs b/src/main.rs index db7348c..6bd6dda 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ /// This library is used to create a parser for YARA language /// It should provide also token for whitespaces /// as we want full fidelity and error resilience.; -use std::{env::args, fs, path::Path}; +use std::{env::args, fs, io::Write, path::Path}; use rowan_test::{GreenNode, NodeOrToken}; @@ -42,27 +42,68 @@ fn parse_text(text: &str) -> (GreenNode, Vec) { println!(); let indent = 0; - print(indent, syntax_tree.into()); - //for child in syntax_tree.children() { - // print!("{:indent$}", "", indent = indent); - // println!("{:?}", child.kind()); - // println!("{:?}", child.green().children()); - //} + let result = print(indent, syntax_tree.into()); + + print!("{}", result); (tree, parser_errors) } -fn print(indent: usize, element: SyntaxElement) { +fn print(indent: usize, element: SyntaxElement) -> String { + let mut result = String::new(); let kind: SyntaxKind = element.kind(); - print!("{:indent$}", "", indent = indent); + result.push_str(&format!("{:indent$}", "", indent = indent)); match element { NodeOrToken::Node(node) => { - println!("- {:?}", kind); + result.push_str(&format!("- {:?}\n", kind)); for child in node.children_with_tokens() { - print(indent + 2, child); + result.push_str(&print(indent + 2, child)); } } - NodeOrToken::Token(token) => println!("- {:?} {:?}", token.text(), kind), + NodeOrToken::Token(token) => { + result.push_str(&format!("- {:?} {:?}\n", token.text(), kind)); + } + } + result +} + +#[test] +fn test_parse_text() { + let mut mint = goldenfile::Mint::new("."); + + for entry in globwalk::glob("tests/*.in").unwrap().flatten() { + // Path to the .in.zip file. + let path = entry.into_path(); + let display_path = path.display(); + + let input = fs::read_to_string(&path) + .unwrap_or_else(|_| panic!("Failed to read input file {:?}", display_path)); + + let (tree, errors) = parse_text(&input); + + let out_path = path.with_extension("").with_extension("out"); + let syntax_tree = SyntaxNode::new_root(tree.clone()); + + let output = print(0, syntax_tree.into()); + + let mut output_file = mint.new_goldenfile(out_path).unwrap(); + + write!(output_file, "{}", output).unwrap(); + + // Check errors + let err_path = path.with_extension("").with_extension("err"); + if err_path.exists() { + let expected_errors = fs::read_to_string(&err_path) + .unwrap_or_else(|_| panic!("Failed to read error file {:?}", err_path.display())); + let actual_errors = errors + .iter() + .map(|error| format!("{:?}", error)) + .collect::>() + .join("\n"); + assert_eq!(actual_errors, expected_errors); + } else { + assert!(errors.is_empty(), "Unexpected errors: {:?}", errors); + } } } diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index da80b41..d347869 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -45,7 +45,8 @@ fn condition(p: &mut Parser) { m.complete(p, CONDITION); } -const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE]); +const VARIABLE_RECOVERY_SET: TokenSet = + TokenSet::new(&[VARIABLE, CONDITION, STRINGS, ASSIGN, RBRACE]); pub(super) fn strings_body(p: &mut Parser) { // add support for meta also diff --git a/src/parser/parser.rs b/src/parser/parser.rs index ab1dd00..d6afde2 100644 --- a/src/parser/parser.rs +++ b/src/parser/parser.rs @@ -111,15 +111,8 @@ impl<'t> Parser<'t> { } pub(crate) fn err_recover(&mut self, message: &str, recovery: TokenSet) { - match self.current() { - LBRACE | RBRACE => { - self.error(message); - return; - } - _ => (), - } - if self.at_ts(recovery) { + println!("recovery: {:?}", self.current()); self.error(message); return; } diff --git a/tests/test1.in b/tests/test1.in new file mode 100644 index 0000000..69ed034 --- /dev/null +++ b/tests/test1.in @@ -0,0 +1,7 @@ +rule test +{ + strings: + $a = "foo" + condition: + $a +} diff --git a/tests/test1.out b/tests/test1.out new file mode 100644 index 0000000..f4c65e8 --- /dev/null +++ b/tests/test1.out @@ -0,0 +1,33 @@ +- SOURCE_FILE + - RULE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - STRINGS + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - LITERAL + - "$a" VARIABLE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test2.in b/tests/test2.in new file mode 100644 index 0000000..4e26293 --- /dev/null +++ b/tests/test2.in @@ -0,0 +1,9 @@ +rule test +{ + strings: + $a = "foo" + $b = "bar" + condition: + $a or + $b +} diff --git a/tests/test2.out b/tests/test2.out new file mode 100644 index 0000000..68899c1 --- /dev/null +++ b/tests/test2.out @@ -0,0 +1,48 @@ +- SOURCE_FILE + - RULE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - STRINGS + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - LITERAL + - "$b" VARIABLE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test3.in b/tests/test3.in new file mode 100644 index 0000000..9f68a9b --- /dev/null +++ b/tests/test3.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + $a = "foo" + $b = "bar" + condition: + $a or + $b and true +} diff --git a/tests/test3.out b/tests/test3.out new file mode 100644 index 0000000..1d407f9 --- /dev/null +++ b/tests/test3.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - EXPRESSION + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test4.err b/tests/test4.err new file mode 100644 index 0000000..bf9ebfc --- /dev/null +++ b/tests/test4.err @@ -0,0 +1 @@ +SyntaxError("expected a variable", 98..98) \ No newline at end of file diff --git a/tests/test4.in b/tests/test4.in new file mode 100644 index 0000000..8f0a414 --- /dev/null +++ b/tests/test4.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + a = "foo" + $b = "bar" + condition: + $a or + $b and true +} diff --git a/tests/test4.out b/tests/test4.out new file mode 100644 index 0000000..699f73c --- /dev/null +++ b/tests/test4.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - EXPRESSION + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test5.err b/tests/test5.err new file mode 100644 index 0000000..af68e68 --- /dev/null +++ b/tests/test5.err @@ -0,0 +1 @@ +SyntaxError("expected expression", 144..144) \ No newline at end of file diff --git a/tests/test5.in b/tests/test5.in new file mode 100644 index 0000000..75bfc9f --- /dev/null +++ b/tests/test5.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + $a = "foo" + $b = "bar" + condition: + $a or + b and true +} diff --git a/tests/test5.out b/tests/test5.out new file mode 100644 index 0000000..5753b77 --- /dev/null +++ b/tests/test5.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - ERROR + - "b" IDENTIFIER + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test6.err b/tests/test6.err new file mode 100644 index 0000000..b6080fd --- /dev/null +++ b/tests/test6.err @@ -0,0 +1,13 @@ +SyntaxError("expected a name", 38..38) +SyntaxError("expected strings or condition", 92..92) +SyntaxError("expected strings or condition", 98..98) +SyntaxError("expected strings or condition", 102..102) +SyntaxError("expected strings or condition", 104..104) +SyntaxError("expected strings or condition", 106..106) +SyntaxError("expected strings or condition", 114..114) +SyntaxError("expected strings or condition", 117..117) +SyntaxError("expected strings or condition", 119..119) +SyntaxError("expected expression", 139..139) +SyntaxError("expected expression", 141..141) +SyntaxError("expected expression", 150..150) +SyntaxError("Invalid character", 98..99) \ No newline at end of file diff --git a/tests/test6.in b/tests/test6.in new file mode 100644 index 0000000..cc3cb4e --- /dev/null +++ b/tests/test6.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule condition +{ + //Rule block comment + + //String comment + string* + a = 00000 + $b = "bar" + condition: + a ord + $b ant +} diff --git a/tests/test6.out b/tests/test6.out new file mode 100644 index 0000000..9aeedac --- /dev/null +++ b/tests/test6.out @@ -0,0 +1,60 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - ERROR + - "condition" CONDITION + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - ERROR + - "string" IDENTIFIER + - ERROR + - "*" ERROR + - "\n\t\t" WHITESPACE + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - ERROR + - "=" ASSIGN + - " " WHITESPACE + - ERROR + - "00000" NUMBER + - "\n\t\t" WHITESPACE + - ERROR + - "$b" VARIABLE + - " " WHITESPACE + - ERROR + - "=" ASSIGN + - " " WHITESPACE + - ERROR + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - ERROR + - "ord" IDENTIFIER + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - ERROR + - "ant" IDENTIFIER + - " \n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE From 978312789b31f646c1461a32bae0ec8930d9b85f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Thu, 25 Jan 2024 14:36:58 +0100 Subject: [PATCH 16/20] Add yara subset grammar and fix recovery sets --- src/parser/grammar/expressions.rs | 2 +- src/parser/grammar/expressions/atom.rs | 2 +- yara_subset.grammar | 12 ++++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 yara_subset.grammar diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index d347869..452300f 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -46,7 +46,7 @@ fn condition(p: &mut Parser) { } const VARIABLE_RECOVERY_SET: TokenSet = - TokenSet::new(&[VARIABLE, CONDITION, STRINGS, ASSIGN, RBRACE]); + TokenSet::new(&[VARIABLE]); pub(super) fn strings_body(p: &mut Parser) { // add support for meta also diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs index f425d7d..18f227f 100644 --- a/src/parser/grammar/expressions/atom.rs +++ b/src/parser/grammar/expressions/atom.rs @@ -13,7 +13,7 @@ pub(crate) fn literal(p: &mut Parser) -> Option { Some(m.complete(p, LITERAL)) } -const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE, TRUE, FALSE]); +const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE, TRUE, FALSE, AND, OR, NOT]); // add support for while/for loops, if/else statements, etc. pub(super) fn atom_expr(p: &mut Parser) -> Option { diff --git a/yara_subset.grammar b/yara_subset.grammar new file mode 100644 index 0000000..9adf063 --- /dev/null +++ b/yara_subset.grammar @@ -0,0 +1,12 @@ +SOURCE -> RULE | eps. +RULE -> rule identifier lbrace RULEBODY rbrace. +RULEBODY -> STRINGS RULEBODY | CONDITION RULEBODY | eps. +STRINGS -> string colon STRINGSBODY. +CONDITION -> condition colon CONDITIONBODY. +STRINGSBODY -> variable assign string STRINGSBODY | eps. +CONDITIONBODY -> LITERAL CONDITIONBODY | OPERATOR CONDITIONBODY | eps. +LITERAL -> variable | BOOLEAN. +BOOLEAN -> true | false. +OPERATOR -> and | or | not. + +// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+%7C+CONDITION+%7C+eps.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+CONDITIONBODY.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0ACONDITIONBODY+-%3E+LITERAL+EXPRESSION+%7C+OPERATOR+EXPRESSION+%7C+eps.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or+%7C+not. \ No newline at end of file From 22eedaf08f1ebd87c1d33c44b83292b2aa153a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Thu, 25 Jan 2024 14:43:17 +0100 Subject: [PATCH 17/20] fmt fix --- src/parser/grammar/expressions.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index 452300f..da80b41 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -45,8 +45,7 @@ fn condition(p: &mut Parser) { m.complete(p, CONDITION); } -const VARIABLE_RECOVERY_SET: TokenSet = - TokenSet::new(&[VARIABLE]); +const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE]); pub(super) fn strings_body(p: &mut Parser) { // add support for meta also From 9623d2bd1181f2e40e305e4646f4be0e3c8c0fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Mon, 29 Jan 2024 16:19:04 +0100 Subject: [PATCH 18/20] fix strings and conditions redefinition and grammar --- src/parser/grammar/expressions.rs | 21 +++++++++++++++++++-- src/parser/parser.rs | 1 - yara_subset.grammar | 7 ++++--- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index da80b41..ba715d7 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -15,11 +15,28 @@ pub(crate) fn block_expr(p: &mut Parser) { } pub(super) fn rule_body(p: &mut Parser) { + let mut has_strings = false; + let mut has_condition = false; while !p.at(EOF) && !p.at(RBRACE) { match p.current() { // add metadata later - STRINGS => strings(p), - CONDITION => condition(p), + STRINGS => { + if has_strings { + p.error("only one strings block is allowed"); + } + if has_condition { + p.error("strings block must come before condition block"); + } + strings(p); + has_strings = true; + } + CONDITION => { + if has_condition { + p.error("only one condition block is allowed"); + } + condition(p); + has_condition = true; + } _ => { p.err_and_bump("expected strings or condition"); } diff --git a/src/parser/parser.rs b/src/parser/parser.rs index d6afde2..a9fef81 100644 --- a/src/parser/parser.rs +++ b/src/parser/parser.rs @@ -112,7 +112,6 @@ impl<'t> Parser<'t> { pub(crate) fn err_recover(&mut self, message: &str, recovery: TokenSet) { if self.at_ts(recovery) { - println!("recovery: {:?}", self.current()); self.error(message); return; } diff --git a/yara_subset.grammar b/yara_subset.grammar index 9adf063..c558490 100644 --- a/yara_subset.grammar +++ b/yara_subset.grammar @@ -1,12 +1,13 @@ SOURCE -> RULE | eps. RULE -> rule identifier lbrace RULEBODY rbrace. -RULEBODY -> STRINGS RULEBODY | CONDITION RULEBODY | eps. +RULEBODY -> STRINGS CONDITION | CONDITION . STRINGS -> string colon STRINGSBODY. CONDITION -> condition colon CONDITIONBODY. STRINGSBODY -> variable assign string STRINGSBODY | eps. -CONDITIONBODY -> LITERAL CONDITIONBODY | OPERATOR CONDITIONBODY | eps. +CONDITIONBODY -> LITERAL EXPRESSION | OPERATOR EXPRESSION . +EXPRESSION -> LITERAL EXPRESSION | OPERATOR EXPRESSION | eps. LITERAL -> variable | BOOLEAN. BOOLEAN -> true | false. OPERATOR -> and | or | not. -// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+%7C+CONDITION+%7C+eps.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+CONDITIONBODY.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0ACONDITIONBODY+-%3E+LITERAL+EXPRESSION+%7C+OPERATOR+EXPRESSION+%7C+eps.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or+%7C+not. \ No newline at end of file +// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+CONDITION+%7C+CONDITION+.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+CONDITIONBODY.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0ACONDITIONBODY+-%3E+LITERAL+EXPRESSION+%7C+OPERATOR+EXPRESSION+.%0D%0AEXPRESSION+-%3E+LITERAL+EXPRESSION+%7C+OPERATOR+EXPRESSION+%7C+eps.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or+%7C+not. \ No newline at end of file From 7edfc3fc452e8f94f06c40891e8cd60868bd3638 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Mon, 29 Jan 2024 16:36:02 +0100 Subject: [PATCH 19/20] add support for unary expression in grammar --- src/parser/grammar/expressions/atom.rs | 2 +- yara_subset.grammar | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs index 18f227f..8a87972 100644 --- a/src/parser/grammar/expressions/atom.rs +++ b/src/parser/grammar/expressions/atom.rs @@ -13,7 +13,7 @@ pub(crate) fn literal(p: &mut Parser) -> Option { Some(m.complete(p, LITERAL)) } -const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE, TRUE, FALSE, AND, OR, NOT]); +const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE, TRUE, FALSE, NOT]); // add support for while/for loops, if/else statements, etc. pub(super) fn atom_expr(p: &mut Parser) -> Option { diff --git a/yara_subset.grammar b/yara_subset.grammar index c558490..7403344 100644 --- a/yara_subset.grammar +++ b/yara_subset.grammar @@ -2,12 +2,12 @@ SOURCE -> RULE | eps. RULE -> rule identifier lbrace RULEBODY rbrace. RULEBODY -> STRINGS CONDITION | CONDITION . STRINGS -> string colon STRINGSBODY. -CONDITION -> condition colon CONDITIONBODY. +CONDITION -> condition colon EXPRESSION. STRINGSBODY -> variable assign string STRINGSBODY | eps. -CONDITIONBODY -> LITERAL EXPRESSION | OPERATOR EXPRESSION . -EXPRESSION -> LITERAL EXPRESSION | OPERATOR EXPRESSION | eps. +EXPRESSION -> EXPRESSION OPERATOR EXPRESSION | NOTOPERATOR EXPRESSION | LITERAL. LITERAL -> variable | BOOLEAN. BOOLEAN -> true | false. -OPERATOR -> and | or | not. +OPERATOR -> and | or. +NOTOPERATOR -> not. -// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+CONDITION+%7C+CONDITION+.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+CONDITIONBODY.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0ACONDITIONBODY+-%3E+LITERAL+EXPRESSION+%7C+OPERATOR+EXPRESSION+.%0D%0AEXPRESSION+-%3E+LITERAL+EXPRESSION+%7C+OPERATOR+EXPRESSION+%7C+eps.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or+%7C+not. \ No newline at end of file +// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+CONDITION+%7C+CONDITION+.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+EXPRESSION.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0AEXPRESSION+-%3E+EXPRESSION+OPERATOR+EXPRESSION+%7C+NOTOPERATOR+EXPRESSION+%7C+LITERAL.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or.%0D%0ANOTOPERATOR+-%3E+not. \ No newline at end of file From dd490284c9f781a0aced0cb6a840178bc18f0118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Euri=C5=A1?= Date: Mon, 29 Jan 2024 17:30:49 +0100 Subject: [PATCH 20/20] fix left recursion --- yara_subset.grammar | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yara_subset.grammar b/yara_subset.grammar index 7403344..d0b1894 100644 --- a/yara_subset.grammar +++ b/yara_subset.grammar @@ -4,10 +4,11 @@ RULEBODY -> STRINGS CONDITION | CONDITION . STRINGS -> string colon STRINGSBODY. CONDITION -> condition colon EXPRESSION. STRINGSBODY -> variable assign string STRINGSBODY | eps. -EXPRESSION -> EXPRESSION OPERATOR EXPRESSION | NOTOPERATOR EXPRESSION | LITERAL. +EXPRESSION -> LITERAL EXPRESSION_2 | NOTOPERATOR EXPRESSION. +EXPRESSION_2 -> OPERATOR EXPRESSION EXPRESSION_2 | eps. LITERAL -> variable | BOOLEAN. BOOLEAN -> true | false. OPERATOR -> and | or. NOTOPERATOR -> not. -// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+CONDITION+%7C+CONDITION+.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+EXPRESSION.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0AEXPRESSION+-%3E+EXPRESSION+OPERATOR+EXPRESSION+%7C+NOTOPERATOR+EXPRESSION+%7C+LITERAL.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or.%0D%0ANOTOPERATOR+-%3E+not. \ No newline at end of file +// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+CONDITION+%7C+CONDITION+.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+EXPRESSION.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0AEXPRESSION+-%3E+LITERAL+EXPRESSION_2+%7C+NOTOPERATOR+EXPRESSION.%0D%0AEXPRESSION_2+-%3E+OPERATOR+EXPRESSION+EXPRESSION_2+%7C+eps.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or.%0D%0ANOTOPERATOR+-%3E+not. \ No newline at end of file