diff --git a/.github/workflows/code-health.yml b/.github/workflows/code-health.yml new file mode 100644 index 0000000..fa2944d --- /dev/null +++ b/.github/workflows/code-health.yml @@ -0,0 +1,40 @@ +name: Code health + +on: [push, pull_request] + +jobs: + clippy: + name: Clippy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@1.71.0 + with: + components: clippy + - run: cargo clippy --tests --no-deps --all-features -- --deny clippy::all + + rustfmt: + name: Rustfmt + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@1.71.0 + with: + components: rustfmt + - run: cargo fmt --all --check + + udeps: + name: Unused dependencies + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Install nightly toolchain + uses: dtolnay/rust-toolchain@nightly + + - name: Run cargo-udeps + uses: aig787/cargo-udeps-action@v1 + with: + version: v0.1.35 + args: '--all-targets' diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..31000a2 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,22 @@ +name: Rust + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose diff --git a/.gitignore b/.gitignore index ea8c4bf..ccb5166 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +.vscode \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 66dc583..8094f17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -14,24 +23,204 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata 0.1.10", +] + +[[package]] +name = "bstr" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "windows-sys", +] + [[package]] name = "countme" version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7704b5fdd17b18ae31c4c1da5a2e0305a2bf17b5249300a9ee9ed7b72114c636" +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "drop_bomb" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bda8e21c04aca2ae33ffc2fd8c23134f3cac46db123ba97bd9d3f3b8a4a85e1" + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "globset" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"57da3b9b5b85bd66f31093f8c408b90a74431672542466497dcbdfdc02034be1" +dependencies = [ + "aho-corasick", + "bstr 1.9.0", + "log", + "regex-automata 0.4.4", + "regex-syntax 0.8.2", +] + +[[package]] +name = "globwalk" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" +dependencies = [ + "bitflags 2.4.2", + "ignore", + "walkdir", +] + +[[package]] +name = "goldenfile" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a67453a3b358bd8213aedafd4feed75eecab9fb04bed26ba6fdf94694be560" +dependencies = [ + "scopeguard", + "similar-asserts", + "tempfile", + "yansi", +] + [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +[[package]] +name = "ignore" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b46810df39e66e925525d6e38ce1e7f6e1d208f72dc39757880fcb66e2c58af1" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.4", + "same-file", + "walkdir", + "winapi-util", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + [[package]] name = "logos" version = "0.13.0" @@ -51,7 +240,7 @@ dependencies = [ "fnv", "proc-macro2", "quote", - "regex-syntax", + "regex-syntax 0.6.29", "syn", ] @@ -64,6 +253,12 @@ dependencies = [ "logos-codegen", ] +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + [[package]] name = "memoffset" version = "0.9.0" @@ -91,16 +286,48 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-automata" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + [[package]] name = "regex-syntax" version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + [[package]] name = "rowan-test" version = "0.1.0" -source = "git+https://github.com/TommYDeeee/rowan-test.git#8a8c7aa1bdd6905c508e848a389cbe5237f332dd" +source = "git+https://github.com/avast/avast-rowan.git#357157c01d3bf543b22c70bba1b91fa7e19d8498" dependencies = [ "countme", 
"hashbrown", @@ -115,6 +342,74 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustix" +version = "0.38.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" +dependencies = [ + "bitflags 2.4.2", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.195" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.195" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "similar" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32fea41aca09ee824cc9724996433064c89f7777e60762749a4170a14abbfa21" +dependencies = [ + "bstr 0.2.17", + "unicode-segmentation", +] + +[[package]] +name = "similar-asserts" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e041bb827d1bfca18f213411d51b665309f1afb37a04a5d1464530e13779fc0f" +dependencies = [ + "console", + "similar", +] + [[package]] name = "syn" version = "2.0.46" @@ -126,6 
+421,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", +] + [[package]] name = "text-size" version = "1.1.1" @@ -138,10 +446,132 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] 
+name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "yansi" +version = "1.0.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1367295b8f788d371ce2dbc842c7b709c73ee1364d30351dd300ec2203b12377" + [[package]] name = "yara-parser" version = "0.1.0" dependencies = [ + "drop_bomb", + "globwalk", + "goldenfile", "logos", "rowan-test", "text-size", diff --git a/Cargo.toml b/Cargo.toml index 7ddb60e..2a3d280 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,5 +7,10 @@ edition = "2021" [dependencies] logos = "0.13.0" -rowan-test = { git = "https://github.com/TommYDeeee/rowan-test.git" } +rowan-test = { git = "https://github.com/avast/avast-rowan.git" } text-size = "1.1.1" +drop_bomb = "0.1.5" + +[dev-dependencies] +goldenfile = "1.6.0" +globwalk = "0.9.1" diff --git a/example.yar b/example.yar index 25ce9d9..9f68a9b 100644 --- a/example.yar +++ b/example.yar @@ -1,12 +1,15 @@ //Global comment //Rule comment -rule foo +rule test { + //Rule block comment + + //String comment strings: $a = "foo" $b = "bar" condition: - $a and - $b + $a or + $b and true } diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index cee042f..bb30898 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -175,8 +175,7 @@ fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind { LogosToken::True => SyntaxKind::TRUE, LogosToken::False => SyntaxKind::FALSE, LogosToken::Whitespace => SyntaxKind::WHITESPACE, - LogosToken::Comment => SyntaxKind::COMMENT, - LogosToken::MultilineComment => SyntaxKind::MULTILINECOMMENT, + LogosToken::Comment | LogosToken::MultilineComment => SyntaxKind::COMMENT, } } diff --git a/src/main.rs b/src/main.rs index 1a150cb..6bd6dda 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,16 @@ /// This library is used to create a parser for YARA language /// It should provide also token for whitespaces /// as we want full fidelity and 
error resilience.; -use std::{env::args, fs, path::Path}; +use std::{env::args, fs, io::Write, path::Path}; + +use rowan_test::{GreenNode, NodeOrToken}; use crate::lexer::tokenize; -use crate::syntax::syntax_error::SyntaxError; +use crate::parser::{SyntaxKind, TokenSource, TreeSink}; +use crate::syntax::syntax_node::{SyntaxElement, SyntaxNode}; +use crate::syntax::{ + syntax_error::SyntaxError, text_token_source::TextTokenSource, text_tree_sink::TextTreeSink, +}; mod lexer; mod parser; @@ -19,9 +25,85 @@ fn main() { parse_text(&input); } -fn parse_text(text: &str) -> ((), Vec) { - let tokens = tokenize(text); - println!("{:?}", tokens); +fn parse_text(text: &str) -> (GreenNode, Vec) { + let (tokens, lexer_errors) = tokenize(text); + let mut token_source = TextTokenSource::new(text, &tokens); + let mut tree_sink = TextTreeSink::new(text, &tokens); + + parser::parse(&mut token_source, &mut tree_sink); + let (tree, mut parser_errors) = tree_sink.finish(); + parser_errors.extend(lexer_errors); + + let syntax_tree = SyntaxNode::new_root(tree.clone()); + + println!("Tokens: \n{:?}", tokens); + println!(); + println!("Errors: \n{:?}", parser_errors); + println!(); + + let indent = 0; + let result = print(indent, syntax_tree.into()); + + print!("{}", result); + + (tree, parser_errors) +} + +fn print(indent: usize, element: SyntaxElement) -> String { + let mut result = String::new(); + let kind: SyntaxKind = element.kind(); + result.push_str(&format!("{:indent$}", "", indent = indent)); + match element { + NodeOrToken::Node(node) => { + result.push_str(&format!("- {:?}\n", kind)); + for child in node.children_with_tokens() { + result.push_str(&print(indent + 2, child)); + } + } + + NodeOrToken::Token(token) => { + result.push_str(&format!("- {:?} {:?}\n", token.text(), kind)); + } + } + result +} + +#[test] +fn test_parse_text() { + let mut mint = goldenfile::Mint::new("."); + + for entry in globwalk::glob("tests/*.in").unwrap().flatten() { + // Path to the .in file. 
+ let path = entry.into_path(); + let display_path = path.display(); + + let input = fs::read_to_string(&path) + .unwrap_or_else(|_| panic!("Failed to read input file {:?}", display_path)); + + let (tree, errors) = parse_text(&input); + + let out_path = path.with_extension("").with_extension("out"); + let syntax_tree = SyntaxNode::new_root(tree.clone()); + + let output = print(0, syntax_tree.into()); + + let mut output_file = mint.new_goldenfile(out_path).unwrap(); + + write!(output_file, "{}", output).unwrap(); - ((), Vec::new()) + // Check errors + let err_path = path.with_extension("").with_extension("err"); + if err_path.exists() { + let expected_errors = fs::read_to_string(&err_path) + .unwrap_or_else(|_| panic!("Failed to read error file {:?}", err_path.display())); + let actual_errors = errors + .iter() + .map(|error| format!("{:?}", error)) + .collect::>() + .join("\n"); + assert_eq!(actual_errors, expected_errors); + } else { + assert!(errors.is_empty(), "Unexpected errors: {:?}", errors); + } + } } diff --git a/src/parser/event.rs b/src/parser/event.rs new file mode 100644 index 0000000..af2c563 --- /dev/null +++ b/src/parser/event.rs @@ -0,0 +1,74 @@ +use std::mem; + +use crate::parser::{ + ParseError, + SyntaxKind::{self, *}, + TreeSink, +}; + +#[derive(Debug)] +pub(crate) enum Event { + Start { + kind: SyntaxKind, + forward_parent: Option, + }, + + Finish, + + Token { + kind: SyntaxKind, + n_raw_tokens: u8, + }, + + Error { + msg: ParseError, + }, +} + +impl Event { + pub(crate) fn tombstone() -> Self { + Event::Start { + kind: TOMBSTONE, + forward_parent: None, + } + } +} + +pub(crate) fn process(sink: &mut dyn TreeSink, mut events: Vec) { + let mut forward_parents = Vec::new(); + + for i in 0..events.len() { + match mem::replace(&mut events[i], Event::tombstone()) { + Event::Start { + kind, + forward_parent, + } => { + forward_parents.push(kind); + let mut idx = i; + let mut fp = forward_parent; + while let Some(fwd) = fp { + idx += fwd as usize; + 
fp = match mem::replace(&mut events[idx], Event::tombstone()) { + Event::Start { + kind, + forward_parent, + } => { + forward_parents.push(kind); + forward_parent + } + _ => unreachable!(), + }; + } + + for kind in forward_parents.drain(..).rev() { + if kind != TOMBSTONE { + sink.start_node(kind); + } + } + } + Event::Finish => sink.finish_node(), + Event::Token { kind, n_raw_tokens } => sink.token(kind, n_raw_tokens), + Event::Error { msg } => sink.error(msg), + } + } +} diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs new file mode 100644 index 0000000..1268919 --- /dev/null +++ b/src/parser/grammar.rs @@ -0,0 +1,36 @@ +mod expressions; +mod items; + +use crate::parser::{ + grammar::expressions::rule_body, + parser::{CompletedMarker, Marker, Parser}, + token_set::TokenSet, + SyntaxKind::{self, *}, +}; + +pub(crate) fn parse_source_file(p: &mut Parser) { + let m = p.start(); + + items::mod_content(p, false); + m.complete(p, SOURCE_FILE); +} + +fn error_block(p: &mut Parser, message: &str) { + assert!(p.at(LBRACE)); + let m = p.start(); + p.error(message); + p.bump(LBRACE); + rule_body(p); + p.eat(RBRACE); + m.complete(p, ERROR); +} + +fn name_r(p: &mut Parser<'_>, recovery: TokenSet) { + if p.at(IDENTIFIER) { + let m = p.start(); + p.bump(IDENTIFIER); + m.complete(p, IDENTIFIER); + } else { + p.err_recover("expected a name", recovery); + } +} diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs new file mode 100644 index 0000000..ba715d7 --- /dev/null +++ b/src/parser/grammar/expressions.rs @@ -0,0 +1,169 @@ +mod atom; + +use super::*; + +pub(crate) fn block_expr(p: &mut Parser) { + if !p.at(LBRACE) { + p.error("expected a block expression"); + return; + } + let m = p.start(); + p.bump(LBRACE); + rule_body(p); + p.expect(RBRACE); + m.complete(p, BLOCK_EXPR); +} + +pub(super) fn rule_body(p: &mut Parser) { + let mut has_strings = false; + let mut has_condition = false; + while !p.at(EOF) && !p.at(RBRACE) { + match p.current() 
{ + // add metadata later + STRINGS => { + if has_strings { + p.error("only one strings block is allowed"); + } + if has_condition { + p.error("strings block must come before condition block"); + } + strings(p); + has_strings = true; + } + CONDITION => { + if has_condition { + p.error("only one condition block is allowed"); + } + condition(p); + has_condition = true; + } + _ => { + p.err_and_bump("expected strings or condition"); + } + } + } +} + +fn strings(p: &mut Parser) { + assert!(p.at(STRINGS)); + let m = p.start(); + p.bump(STRINGS); + p.expect(COLON); + strings_body(p); + m.complete(p, STRINGS); +} + +fn condition(p: &mut Parser) { + assert!(p.at(CONDITION)); + let m = p.start(); + p.bump(CONDITION); + p.expect(COLON); + condition_body(p); + m.complete(p, CONDITION); +} + +const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE]); + +pub(super) fn strings_body(p: &mut Parser) { + // add support for meta also + while !p.at(EOF) && !p.at(STRINGS) && !p.at(CONDITION) && !p.at(RBRACE) { + let m = p.start(); + if p.at(VARIABLE) { + let m = p.start(); + p.bump(VARIABLE); + m.complete(p, VARIABLE); + } else { + p.err_recover("expected a variable", VARIABLE_RECOVERY_SET); + } + p.expect(ASSIGN); + // so far only strings are supported, later add match for hex strings and regex + string(p); + m.complete(p, VARIABLE_STMT); + } +} + +// add support for hex and regex strings later on +fn string(p: &mut Parser) { + let m = p.start(); + match p.current() { + STRING => p.bump(STRING), + _ => p.err_and_bump("expected a string"), + } + // add string modifiers + m.complete(p, STRING); +} + +pub(super) fn condition_body(p: &mut Parser) { + // add support for meta also + while !p.at(EOF) && !p.at(STRINGS) && !p.at(CONDITION) && !p.at(RBRACE) { + let m = p.start(); + if let Some(cm) = expression(p, Some(m), 1) { + let m = cm.precede(p); + m.complete(p, EXPRESSION_STMT); + } + } +} + +enum Associativity { + Left, + Right, +} + +/// Binding powers of operators for a Pratt 
parser. +fn current_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { + match p.current() { + // add support for other operators + AND => (4, AND, Associativity::Left), + OR => (3, OR, Associativity::Left), + _ => (0, ERROR, Associativity::Left), + } +} + +fn expression(p: &mut Parser, m: Option, bp: u8) -> Option { + let m = m.unwrap_or_else(|| p.start()); + let mut lhs = match lhs(p) { + Some(lhs) => lhs.extend_to(p, m), + None => { + m.abandon(p); + return None; + } + }; + + loop { + let (op_bp, op, associativity) = current_op(p); + if op_bp < bp { + break; + } + let m = lhs.precede(p); + p.bump(op); + + let op_bp = match associativity { + Associativity::Left => op_bp + 1, + Associativity::Right => op_bp, + }; + expression(p, None, op_bp); + lhs = m.complete(p, EXPRESSION); + } + Some(lhs) +} + +fn lhs(p: &mut Parser) -> Option { + let m; + let kind = match p.current() { + // unary operators + NOT => { + m = p.start(); + p.bump_any(); + PREFIX_EXPR + } + // everything else is parsed as an atom expression + _ => { + let lhs = atom::atom_expr(p)?; + return Some(lhs); + } + }; + // parse the operand of the unary operator + expression(p, None, 255); + let cm = m.complete(p, kind); + Some(cm) +} diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs new file mode 100644 index 0000000..8a87972 --- /dev/null +++ b/src/parser/grammar/expressions/atom.rs @@ -0,0 +1,33 @@ +use super::*; + +// So far the only literals we support are true, false and variables +// numbers will be added later +pub(crate) const LITERAL_FIRST: TokenSet = TokenSet::new(&[TRUE, FALSE, VARIABLE]); + +pub(crate) fn literal(p: &mut Parser) -> Option { + if !p.at_ts(LITERAL_FIRST) { + return None; + } + let m = p.start(); + p.bump_any(); + Some(m.complete(p, LITERAL)) +} + +const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE, TRUE, FALSE, NOT]); + +// add support for while/for loops, if/else statements, etc. 
+pub(super) fn atom_expr(p: &mut Parser) -> Option { + if let Some(m) = literal(p) { + return Some(m); + } + + // This will be extended to support more expressions later + #[allow(clippy::match_single_binding)] + match p.current() { + _ => { + p.err_recover("expected expression", EXPR_RECOVERY_SET); + #[allow(clippy::needless_return)] + return None; + } + }; +} diff --git a/src/parser/grammar/items.rs b/src/parser/grammar/items.rs new file mode 100644 index 0000000..ff500ad --- /dev/null +++ b/src/parser/grammar/items.rs @@ -0,0 +1,58 @@ +use super::*; + +pub(super) const RULE_RECOVERY_SET: TokenSet = TokenSet::new( + // Add import here when it is supported + &[ + RULE, // rule + ], +); + +pub(super) fn mod_content(p: &mut Parser, stop_on_r_brace: bool) { + while !(p.at(EOF) || p.at(RBRACE) && stop_on_r_brace) { + process_top_level(p, stop_on_r_brace); + } +} + +// process either rule, import or include +pub(super) fn process_top_level(p: &mut Parser, stop_on_r_brace: bool) { + let m = p.start(); + let m = match opt_rule_import_include(p, m) { + Ok(()) => { + return; + } + Err(m) => m, + }; + m.abandon(p); + match p.current() { + LBRACE => { + error_block(p, "expected an item"); + } + RBRACE if !stop_on_r_brace => { + let e = p.start(); + p.error("unmatched }"); + p.bump(RBRACE); + e.complete(p, ERROR); + } + EOF | RBRACE => p.error("expected an item"), + _ => p.err_and_bump("expected an item"), + } +} + +// So far in this prototype, we only have one kind of item: a rule. 
+// In the future, also imports and includes will be supported here +pub(super) fn opt_rule_import_include(p: &mut Parser, m: Marker) -> Result<(), Marker> { + // add rule modifiers to match current and lookahead next with p.nth(1) for RULE or ERROR + match p.current() { + RULE => rule(p, m), + _ => return Err(m), + } + Ok(()) +} + +fn rule(p: &mut Parser, m: Marker) { + p.bump(RULE); + name_r(p, RULE_RECOVERY_SET); + // add optional support for rule tags + expressions::block_expr(p); + m.complete(p, RULE); +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index d097c94..5585781 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1 +1,45 @@ pub mod syntaxkind; + +pub use syntaxkind::SyntaxKind; +mod event; +mod grammar; +#[allow(clippy::module_inception)] +mod parser; +mod token_set; + +use grammar::parse_source_file; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ParseError(pub String); + +pub trait TokenSource { + fn current(&self) -> Token; + + fn lookahead_nth(&self, n: usize) -> Token; + + fn bump(&mut self); +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub struct Token { + pub kind: SyntaxKind, + + pub is_jointed_to_next: bool, +} + +pub trait TreeSink { + fn token(&mut self, kind: SyntaxKind, n_tokens: u8); + + fn start_node(&mut self, kind: SyntaxKind); + + fn finish_node(&mut self); + + fn error(&mut self, error: ParseError); +} + +pub fn parse(token_source: &mut dyn TokenSource, tree_sink: &mut dyn TreeSink) { + let mut p = parser::Parser::new(token_source); + parse_source_file(&mut p); + let events = p.finish(); + event::process(tree_sink, events) +} diff --git a/src/parser/parser.rs b/src/parser/parser.rs new file mode 100644 index 0000000..a9fef81 --- /dev/null +++ b/src/parser/parser.rs @@ -0,0 +1,204 @@ +use std::cell::Cell; + +use drop_bomb::DropBomb; + +use crate::parser::{ + event::Event, + token_set::TokenSet, + ParseError, + SyntaxKind::{self, EOF, ERROR, LBRACE, RBRACE, TOMBSTONE}, + TokenSource, +}; + +pub(crate) 
struct Parser<'t> { + token_source: &'t mut dyn TokenSource, + events: Vec, + steps: Cell, +} + +impl<'t> Parser<'t> { + pub(crate) fn new(token_source: &'t mut dyn TokenSource) -> Parser<'t> { + Parser { + token_source, + events: Vec::new(), + steps: Cell::new(0), + } + } + + pub(crate) fn finish(self) -> Vec { + self.events + } + + pub(crate) fn current(&self) -> SyntaxKind { + self.nth(0) + } + + pub(crate) fn nth(&self, n: usize) -> SyntaxKind { + assert!(n < 3); + + let steps = self.steps.get(); + assert!(steps <= 10000000, "infinite loop detected"); + self.steps.set(steps + 1); + + self.token_source.lookahead_nth(n).kind + } + + pub(crate) fn at(&self, kind: SyntaxKind) -> bool { + // currently we don't need support for composite tokens (e.g. `>>`) + self.token_source.lookahead_nth(0).kind == kind + } + + pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool { + if !self.at(kind) { + return false; + } + + // currently we don't need support for composite tokens (e.g. `>>`) + let n_raw_tokens = 1; + self.do_bump(kind, n_raw_tokens); + true + } + + pub(crate) fn at_ts(&self, kinds: TokenSet) -> bool { + kinds.contains(self.current()) + } + + pub(crate) fn start(&mut self) -> Marker { + let pos = self.events.len() as u32; + self.push_event(Event::tombstone()); + Marker::new(pos) + } + + pub(crate) fn bump(&mut self, kind: SyntaxKind) { + assert!(self.eat(kind)); + } + + pub(crate) fn bump_any(&mut self) { + let kind = self.nth(0); + if kind == EOF { + return; + } + self.do_bump(kind, 1); + } + + fn do_bump(&mut self, kind: SyntaxKind, n_raw_tokens: u8) { + for _ in 0..n_raw_tokens { + self.token_source.bump(); + } + + self.push_event(Event::Token { kind, n_raw_tokens }); + } + + fn push_event(&mut self, event: Event) { + self.events.push(event); + } + + pub(crate) fn error>(&mut self, message: T) { + let msg = ParseError(message.into()); + self.push_event(Event::Error { msg }); + } + + pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool { + if 
self.eat(kind) { + return true; + } + self.error(format!("expected {:?}", kind)); + false + } + + pub(crate) fn err_and_bump(&mut self, message: &str) { + self.err_recover(message, TokenSet::EMPTY) + } + + pub(crate) fn err_recover(&mut self, message: &str, recovery: TokenSet) { + if self.at_ts(recovery) { + self.error(message); + return; + } + + let m = self.start(); + self.error(message); + self.bump_any(); + m.complete(self, ERROR); + } +} + +pub(crate) struct Marker { + pos: u32, + bomb: DropBomb, +} + +impl Marker { + fn new(pos: u32) -> Marker { + Marker { + pos, + bomb: DropBomb::new("Marker must be either completed or abandoned"), + } + } + + pub(crate) fn complete(mut self, p: &mut Parser, kind: SyntaxKind) -> CompletedMarker { + self.bomb.defuse(); + let idx = self.pos as usize; + match &mut p.events[idx] { + Event::Start { kind: slot, .. } => { + *slot = kind; + } + _ => unreachable!(), + } + p.push_event(Event::Finish); + CompletedMarker::new(self.pos, kind) + } + + pub(crate) fn abandon(mut self, p: &mut Parser) { + self.bomb.defuse(); + let idx = self.pos as usize; + if idx == p.events.len() - 1 { + match p.events.pop() { + Some(Event::Start { + kind: TOMBSTONE, + forward_parent: None, + }) => (), + _ => unreachable!(), + } + } + } +} + +pub(crate) struct CompletedMarker { + pos: u32, + kind: SyntaxKind, +} + +impl CompletedMarker { + fn new(pos: u32, kind: SyntaxKind) -> Self { + CompletedMarker { pos, kind } + } + + pub(crate) fn precede(self, p: &mut Parser) -> Marker { + let new_pos = p.start(); + let idx = self.pos as usize; + match &mut p.events[idx] { + Event::Start { forward_parent, .. } => { + *forward_parent = Some(new_pos.pos - self.pos); + } + _ => unreachable!(), + } + new_pos + } + + pub(crate) fn extend_to(self, p: &mut Parser, mut m: Marker) -> CompletedMarker { + m.bomb.defuse(); + let idx = m.pos as usize; + match &mut p.events[idx] { + Event::Start { forward_parent, .. 
} => { + *forward_parent = Some(self.pos - m.pos); + } + _ => unreachable!(), + } + self + } + + pub(crate) fn kind(&self) -> SyntaxKind { + self.kind + } +} diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index dfd14d3..fc47c2c 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -1,3 +1,5 @@ +#![allow(clippy::upper_case_acronyms)] + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] #[repr(u16)] pub enum SyntaxKind { @@ -24,6 +26,35 @@ pub enum SyntaxKind { FALSE, WHITESPACE, COMMENT, - MULTILINECOMMENT, ERROR, + SOURCE_FILE, + BLOCK_EXPR, + PREFIX_EXPR, + LITERAL, + EXPRESSION, + EXPRESSION_STMT, + VARIABLE_STMT, + __LAST, +} + +impl From for SyntaxKind { + #[inline] + fn from(d: u16) -> SyntaxKind { + assert!(d <= (SyntaxKind::__LAST as u16)); + unsafe { std::mem::transmute::(d) } + } +} + +impl From for u16 { + #[inline] + fn from(k: SyntaxKind) -> u16 { + k as u16 + } +} + +impl SyntaxKind { + #[inline] + pub fn is_trivia(self) -> bool { + matches!(self, SyntaxKind::WHITESPACE | SyntaxKind::COMMENT) + } } diff --git a/src/parser/token_set.rs b/src/parser/token_set.rs new file mode 100644 index 0000000..fe3d907 --- /dev/null +++ b/src/parser/token_set.rs @@ -0,0 +1,26 @@ +use crate::parser::SyntaxKind; + +#[derive(Clone, Copy)] +pub(crate) struct TokenSet(u128); + +impl TokenSet { + pub(crate) const EMPTY: TokenSet = TokenSet(0); + + pub(crate) const fn new(kinds: &[SyntaxKind]) -> TokenSet { + let mut res = 0u128; + let mut i = 0; + while i < kinds.len() { + res |= mask(kinds[i]); + i += 1; + } + TokenSet(res) + } + + pub(crate) const fn contains(&self, kind: SyntaxKind) -> bool { + self.0 & mask(kind) != 0 + } +} + +const fn mask(kind: SyntaxKind) -> u128 { + 1u128 << (kind as usize) +} diff --git a/src/syntax/ast/mod.rs b/src/syntax/ast/mod.rs new file mode 100644 index 0000000..2dd05f8 --- /dev/null +++ b/src/syntax/ast/mod.rs @@ -0,0 +1,72 @@ +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct 
CommentKind { + pub shape: CommentShape, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum CommentShape { + Line, + Block, +} + +impl CommentShape { + pub fn is_line(self) -> bool { + self == CommentShape::Line + } + + pub fn is_block(self) -> bool { + self == CommentShape::Block + } +} + +impl CommentKind { + const BY_PREFIX: [(&'static str, CommentKind); 5] = [ + ( + "/**/", + CommentKind { + shape: CommentShape::Block, + }, + ), + ( + "/***", + CommentKind { + shape: CommentShape::Block, + }, + ), + ( + "////", + CommentKind { + shape: CommentShape::Line, + }, + ), + ( + "//", + CommentKind { + shape: CommentShape::Line, + }, + ), + ( + "/*", + CommentKind { + shape: CommentShape::Block, + }, + ), + ]; + + pub(crate) fn from_text(text: &str) -> CommentKind { + let &(_prefix, kind) = CommentKind::BY_PREFIX + .iter() + .find(|&(prefix, _kind)| text.starts_with(prefix)) + .unwrap(); + kind + } + + pub fn prefix(&self) -> &'static str { + let &(prefix, _) = CommentKind::BY_PREFIX + .iter() + .rev() + .find(|(_, kind)| kind == self) + .unwrap(); + prefix + } +} diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 98e8d1c..0ebb190 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -1 +1,5 @@ +pub mod ast; pub mod syntax_error; +pub mod syntax_node; +pub mod text_token_source; +pub mod text_tree_sink; diff --git a/src/syntax/syntax_node.rs b/src/syntax/syntax_node.rs new file mode 100644 index 0000000..8d762d0 --- /dev/null +++ b/src/syntax/syntax_node.rs @@ -0,0 +1,68 @@ +//! This module defines Concrete Syntax Tree (CST), used by this YARA parser. +//! +//! The CST includes comments and whitespace, provides a single node type, +//! `SyntaxNode`, and a basic traversal API (parent, children, siblings). +//! +//! The *real* implementation is in the (language-agnostic) `rowan` crate, this +//! module just wraps its API. 
+ +use rowan_test::{GreenNodeBuilder, Language}; +use text_size::TextSize; + +use crate::parser::{self, syntaxkind::SyntaxKind}; +use crate::SyntaxError; + +pub(crate) use rowan_test::GreenNode; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum YARALanguage {} +impl Language for YARALanguage { + type Kind = SyntaxKind; + + fn kind_from_raw(raw: rowan_test::SyntaxKind) -> SyntaxKind { + SyntaxKind::from(raw.0) + } + + fn kind_to_raw(kind: SyntaxKind) -> rowan_test::SyntaxKind { + rowan_test::SyntaxKind(kind.into()) + } +} + +pub type SyntaxNode = rowan_test::SyntaxNode; +pub type SyntaxToken = rowan_test::SyntaxToken; +pub type SyntaxElement = rowan_test::SyntaxElement; +pub type SyntaxNodeChildren = rowan_test::SyntaxNodeChildren; +pub type SyntaxElementChildren = rowan_test::SyntaxElementChildren; +pub type PreorderWithTokens = rowan_test::api::PreorderWithTokens; + +#[derive(Default)] +pub struct SyntaxTreeBuilder { + errors: Vec, + inner: GreenNodeBuilder<'static>, +} + +impl SyntaxTreeBuilder { + pub(crate) fn finish_raw(self) -> (GreenNode, Vec) { + let green = self.inner.finish(); + (green, self.errors) + } + + pub fn token(&mut self, kind: SyntaxKind, text: &str) { + let kind = YARALanguage::kind_to_raw(kind); + self.inner.token(kind, text) + } + + pub fn start_node(&mut self, kind: SyntaxKind) { + let kind = YARALanguage::kind_to_raw(kind); + self.inner.start_node(kind) + } + + pub fn finish_node(&mut self) { + self.inner.finish_node() + } + + pub fn error(&mut self, error: parser::ParseError, text_pos: TextSize) { + self.errors + .push(SyntaxError::new_at_offset(error.0, text_pos)) + } +} diff --git a/src/syntax/text_token_source.rs b/src/syntax/text_token_source.rs new file mode 100644 index 0000000..279d7f1 --- /dev/null +++ b/src/syntax/text_token_source.rs @@ -0,0 +1,76 @@ +use crate::{ + lexer::Token, + parser::{self, SyntaxKind::EOF, TokenSource}, +}; +use text_size::{TextRange, TextSize}; + +pub(crate) struct 
TextTokenSource<'t> { + text: &'t str, + + token_offset_pairs: Vec<(Token, TextSize)>, + + curr: (parser::Token, usize), +} + +impl<'t> TokenSource for TextTokenSource<'t> { + fn current(&self) -> parser::Token { + self.curr.0 + } + + fn lookahead_nth(&self, n: usize) -> parser::Token { + mk_token(self.curr.1 + n, &self.token_offset_pairs) + } + + fn bump(&mut self) { + if self.curr.0.kind == EOF { + return; + } + + let pos = self.curr.1 + 1; + self.curr = (mk_token(pos, &self.token_offset_pairs), pos); + } +} + +fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token { + let (kind, is_jointed_to_next) = match token_offset_pairs.get(pos) { + Some((token, offset)) => ( + token.kind, + token_offset_pairs + .get(pos + 1) + .map(|(_, next_offset)| offset + token.len == *next_offset) + .unwrap_or(false), + ), + None => (EOF, false), + }; + parser::Token { + kind, + is_jointed_to_next, + } +} + +impl<'t> TextTokenSource<'t> { + pub(crate) fn new(text: &'t str, raw_tokens: &'t [Token]) -> TextTokenSource<'t> { + let token_offset_pairs: Vec<_> = raw_tokens + .iter() + .filter_map({ + let mut len = 0.into(); + move |token| { + let pair = if token.kind.is_trivia() { + None + } else { + Some((*token, len)) + }; + len += token.len; + pair + } + }) + .collect(); + + let first = mk_token(0, &token_offset_pairs); + TextTokenSource { + text, + token_offset_pairs, + curr: (first, 0), + } + } +} diff --git a/src/syntax/text_tree_sink.rs b/src/syntax/text_tree_sink.rs new file mode 100644 index 0000000..63f2992 --- /dev/null +++ b/src/syntax/text_tree_sink.rs @@ -0,0 +1,168 @@ +use std::mem; +use text_size::{TextRange, TextSize}; + +use crate::{ + lexer::Token, + parser::{ParseError, SyntaxKind, TreeSink}, + syntax::{ + ast, syntax_error::SyntaxError, syntax_node::GreenNode, syntax_node::SyntaxTreeBuilder, + }, +}; + +pub(crate) struct TextTreeSink<'a> { + text: &'a str, + tokens: &'a [Token], + text_pos: TextSize, + token_pos: usize, + state: State, + 
inner: SyntaxTreeBuilder, +} + +enum State { + PendingStart, + Normal, + PendingFinish, +} + +impl<'a> TreeSink for TextTreeSink<'a> { + fn token(&mut self, kind: SyntaxKind, n_tokens: u8) { + match mem::replace(&mut self.state, State::Normal) { + State::PendingStart => unreachable!(), + State::PendingFinish => { + self.inner.finish_node(); + } + State::Normal => (), + } + self.eat_trivias(); + let n_tokens = n_tokens as usize; + let len = self.tokens[self.token_pos..self.token_pos + n_tokens] + .iter() + .map(|it| it.len) + .sum::(); + self.do_token(kind, len, n_tokens); + } + + fn start_node(&mut self, kind: SyntaxKind) { + match mem::replace(&mut self.state, State::Normal) { + State::PendingStart => { + self.inner.start_node(kind); + return; + } + State::PendingFinish => { + self.inner.finish_node(); + } + State::Normal => (), + } + + let n_trivias = self.tokens[self.token_pos..] + .iter() + .take_while(|it| it.kind.is_trivia()) + .count(); + let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias]; + let mut trivia_end = + self.text_pos + leading_trivias.iter().map(|it| it.len).sum::(); + + let n_attached_trivias = { + let leading_trivias = leading_trivias.iter().rev().map(|it| { + let next_end = trivia_end - it.len; + let range = TextRange::new(next_end, trivia_end); + trivia_end = next_end; + (it.kind, &self.text[range]) + }); + n_attached_trivias(kind, leading_trivias) + }; + self.eat_n_trivias(n_trivias - n_attached_trivias); + self.inner.start_node(kind); + self.eat_n_trivias(n_attached_trivias); + } + + fn finish_node(&mut self) { + match mem::replace(&mut self.state, State::PendingFinish) { + State::PendingStart => unreachable!(), + State::PendingFinish => { + self.inner.finish_node(); + } + State::Normal => (), + } + } + + fn error(&mut self, error: ParseError) { + self.inner.error(error, self.text_pos) + } +} + +impl<'a> TextTreeSink<'a> { + pub(crate) fn new(text: &'a str, tokens: &'a [Token]) -> Self { + Self { + text, + tokens, 
+ text_pos: 0.into(), + token_pos: 0, + state: State::PendingStart, + inner: SyntaxTreeBuilder::default(), + } + } + + pub(crate) fn finish(mut self) -> (GreenNode, Vec) { + match mem::replace(&mut self.state, State::Normal) { + State::PendingFinish => { + self.eat_trivias(); + self.inner.finish_node() + } + State::PendingStart | State::Normal => unreachable!(), + } + + self.inner.finish_raw() + } + + fn eat_trivias(&mut self) { + while let Some(&token) = self.tokens.get(self.token_pos) { + if !token.kind.is_trivia() { + break; + } + self.do_token(token.kind, token.len, 1); + } + } + + fn eat_n_trivias(&mut self, n: usize) { + for _ in 0..n { + let token = self.tokens[self.token_pos]; + assert!(token.kind.is_trivia()); + self.do_token(token.kind, token.len, 1); + } + } + + fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) { + let range = TextRange::at(self.text_pos, len); + let text = &self.text[range]; + self.text_pos += len; + self.token_pos += n_tokens; + self.inner.token(kind, text); + } +} + +fn n_attached_trivias<'a>( + kind: SyntaxKind, + trivias: impl Iterator, +) -> usize { + match kind { + SyntaxKind::RULE | SyntaxKind::BLOCK_EXPR | SyntaxKind::STRINGS | SyntaxKind::CONDITION => { + let mut res = 0; + let trivias = trivias.enumerate().peekable(); + + for (i, (kind, text)) in trivias { + match kind { + SyntaxKind::WHITESPACE if text.contains("\n\n") => { + break; + } + SyntaxKind::COMMENT => { + res = i + 1; + } + _ => (), + } + } + res + } + _ => 0, + } +} diff --git a/tests/test1.in b/tests/test1.in new file mode 100644 index 0000000..69ed034 --- /dev/null +++ b/tests/test1.in @@ -0,0 +1,7 @@ +rule test +{ + strings: + $a = "foo" + condition: + $a +} diff --git a/tests/test1.out b/tests/test1.out new file mode 100644 index 0000000..f4c65e8 --- /dev/null +++ b/tests/test1.out @@ -0,0 +1,33 @@ +- SOURCE_FILE + - RULE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" 
LBRACE + - "\n\t" WHITESPACE + - STRINGS + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - LITERAL + - "$a" VARIABLE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test2.in b/tests/test2.in new file mode 100644 index 0000000..4e26293 --- /dev/null +++ b/tests/test2.in @@ -0,0 +1,9 @@ +rule test +{ + strings: + $a = "foo" + $b = "bar" + condition: + $a or + $b +} diff --git a/tests/test2.out b/tests/test2.out new file mode 100644 index 0000000..68899c1 --- /dev/null +++ b/tests/test2.out @@ -0,0 +1,48 @@ +- SOURCE_FILE + - RULE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - STRINGS + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - LITERAL + - "$b" VARIABLE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test3.in b/tests/test3.in new file mode 100644 index 0000000..9f68a9b --- /dev/null +++ b/tests/test3.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + $a = "foo" + $b = "bar" + condition: + $a or + $b and true +} diff --git a/tests/test3.out b/tests/test3.out 
new file mode 100644 index 0000000..1d407f9 --- /dev/null +++ b/tests/test3.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - EXPRESSION + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test4.err b/tests/test4.err new file mode 100644 index 0000000..bf9ebfc --- /dev/null +++ b/tests/test4.err @@ -0,0 +1 @@ +SyntaxError("expected a variable", 98..98) \ No newline at end of file diff --git a/tests/test4.in b/tests/test4.in new file mode 100644 index 0000000..8f0a414 --- /dev/null +++ b/tests/test4.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + a = "foo" + $b = "bar" + condition: + $a or + $b and true +} diff --git a/tests/test4.out b/tests/test4.out new file mode 100644 index 0000000..699f73c --- /dev/null +++ b/tests/test4.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule 
comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - EXPRESSION + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test5.err b/tests/test5.err new file mode 100644 index 0000000..af68e68 --- /dev/null +++ b/tests/test5.err @@ -0,0 +1 @@ +SyntaxError("expected expression", 144..144) \ No newline at end of file diff --git a/tests/test5.in b/tests/test5.in new file mode 100644 index 0000000..75bfc9f --- /dev/null +++ b/tests/test5.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + $a = "foo" + $b = "bar" + condition: + $a or + b and true +} diff --git a/tests/test5.out b/tests/test5.out new file mode 100644 index 0000000..5753b77 --- /dev/null +++ b/tests/test5.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - 
"//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - ERROR + - "b" IDENTIFIER + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test6.err b/tests/test6.err new file mode 100644 index 0000000..b6080fd --- /dev/null +++ b/tests/test6.err @@ -0,0 +1,13 @@ +SyntaxError("expected a name", 38..38) +SyntaxError("expected strings or condition", 92..92) +SyntaxError("expected strings or condition", 98..98) +SyntaxError("expected strings or condition", 102..102) +SyntaxError("expected strings or condition", 104..104) +SyntaxError("expected strings or condition", 106..106) +SyntaxError("expected strings or condition", 114..114) +SyntaxError("expected strings or condition", 117..117) +SyntaxError("expected strings or condition", 119..119) +SyntaxError("expected expression", 139..139) +SyntaxError("expected expression", 141..141) +SyntaxError("expected expression", 150..150) +SyntaxError("Invalid character", 98..99) \ No newline at end of file diff --git a/tests/test6.in b/tests/test6.in new file mode 100644 index 0000000..cc3cb4e --- /dev/null +++ b/tests/test6.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule condition +{ + //Rule block comment + + //String comment + string* + a = 00000 + $b = "bar" + condition: + a 
ord + $b ant +} diff --git a/tests/test6.out b/tests/test6.out new file mode 100644 index 0000000..9aeedac --- /dev/null +++ b/tests/test6.out @@ -0,0 +1,60 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - ERROR + - "condition" CONDITION + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - ERROR + - "string" IDENTIFIER + - ERROR + - "*" ERROR + - "\n\t\t" WHITESPACE + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - ERROR + - "=" ASSIGN + - " " WHITESPACE + - ERROR + - "00000" NUMBER + - "\n\t\t" WHITESPACE + - ERROR + - "$b" VARIABLE + - " " WHITESPACE + - ERROR + - "=" ASSIGN + - " " WHITESPACE + - ERROR + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - ERROR + - "ord" IDENTIFIER + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - ERROR + - "ant" IDENTIFIER + - " \n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/yara_subset.grammar b/yara_subset.grammar new file mode 100644 index 0000000..d0b1894 --- /dev/null +++ b/yara_subset.grammar @@ -0,0 +1,14 @@ +SOURCE -> RULE | eps. +RULE -> rule identifier lbrace RULEBODY rbrace. +RULEBODY -> STRINGS CONDITION | CONDITION . +STRINGS -> strings colon STRINGSBODY. +CONDITION -> condition colon EXPRESSION. +STRINGSBODY -> variable assign string STRINGSBODY | eps. +EXPRESSION -> LITERAL EXPRESSION_2 | NOTOPERATOR EXPRESSION. +EXPRESSION_2 -> OPERATOR EXPRESSION EXPRESSION_2 | eps. +LITERAL -> variable | BOOLEAN. +BOOLEAN -> true | false. +OPERATOR -> and | or. +NOTOPERATOR -> not. 
+ +// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+CONDITION+%7C+CONDITION+.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+EXPRESSION.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0AEXPRESSION+-%3E+LITERAL+EXPRESSION_2+%7C+NOTOPERATOR+EXPRESSION.%0D%0AEXPRESSION_2+-%3E+OPERATOR+EXPRESSION+EXPRESSION_2+%7C+eps.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or.%0D%0ANOTOPERATOR+-%3E+not. \ No newline at end of file