From d07b5c68595b6fb5209194967e61cf16b3f0d3d3 Mon Sep 17 00:00:00 2001 From: Minigugus <43109623+Minigugus@users.noreply.github.com> Date: Sat, 24 Feb 2024 20:27:41 +0100 Subject: [PATCH] Add doc comment parsing Add token end offsets --- src/lexer.rs | 206 ++++++++++++++++++++++++++++---------------- src/parser.rs | 75 +++++++++++++--- src/printer/java.rs | 4 + src/printer/rust.rs | 5 ++ 4 files changed, 206 insertions(+), 84 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index f858528..3c1cb04 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -1,6 +1,7 @@ use alloc::vec; use alloc::vec::Vec; use core::fmt::{Debug, Formatter}; +use core::ops::Range; #[derive(Eq, PartialEq, Copy, Clone, Debug)] pub enum TokenKind<'a> { @@ -11,6 +12,7 @@ pub enum TokenKind<'a> { Colon, ColonColon, Comma, + DocComment(&'a str), DoubleArrow, Equal, EqualEqual, @@ -37,13 +39,13 @@ pub enum TokenKind<'a> { #[derive(Eq, PartialEq, Clone)] pub struct Token<'a> { pub kind: TokenKind<'a>, - offset: usize, + offset: Range, } impl<'a> Debug for Token<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { let Token { kind, offset } = self; - write!(f, "{kind:?} @ {offset}") + write!(f, "{kind:?} @ {offset:?}") } } @@ -65,6 +67,7 @@ impl<'a> Token<'a> { c if char::is_ascii_whitespace(&c) => Self::skip_while(left, |c| c.is_whitespace()), // comment + '/' if matches!(left.get(0..3), Some("///")) => Self::doc_comment(left), '/' if matches!(left.get(0..2), Some("//")) => Self::skip_while(left, |c| c != '\n'), // symbol @@ -111,7 +114,7 @@ impl<'a> Token<'a> { if let Some(kind) = kind { tokens.push(Token { kind, - offset, + offset: offset..(offset + consumed), }); } left = &left[consumed..]; @@ -147,6 +150,15 @@ impl<'a> Token<'a> { Ok((index, Some(TokenKind::Number((&left[0..index]).parse().map_err(|_| "malformed number literal")?)))) } + fn doc_comment(left: &str) -> (usize, Option) { + let index = Self::get_while( + left, + |c| c != '\n', + ); + + (index, Some(TokenKind::DocComment(&left[3..index]))) + } + fn symbol(left: &str) -> (usize, Option) { let index = Self::get_while( left, @@ -159,7 +171,7 @@ impl<'a> Token<'a> { fn assert_tokenize( input: &str, - expected: &[(TokenKind<'static>, usize)], + expected: &[(TokenKind<'static>, Range)], ) { let tokens = match Token::parse_ascii(input) { Ok(tokens) => tokens, @@ -183,11 +195,11 @@ fn it_can_deal_with_utf8_characters() { assert_tokenize( "p😀ub 😁;", &[ - (Symbol("p"), 0), - (Unexpected('😀'), 1), - (Symbol("ub"), 5), - (Unexpected('😁'), 8), - (Semicolon, 12) + (Symbol("p"), 0..1), + (Unexpected('😀'), 1..5), + (Symbol("ub"), 5..7), + (Unexpected('😁'), 8..12), + (Semicolon, 12..13) ], ) } @@ -199,10 +211,10 @@ fn it_tokenize_mod() { assert_tokenize( "pub mod parser ;", &[ - (Symbol("pub"), 0), - (Symbol("mod"), 5), - (Symbol("parser"), 11), - (Semicolon, 19), + (Symbol("pub"), 0..3), + (Symbol("mod"), 5..8), + (Symbol("parser"), 11..17), + (Semicolon, 19..20) ], ) } @@ -218,23 +230,23 @@ fn it_tokenize_enum() { StopLimit { stop_price: f64, }, }"#, &[ - (Symbol("pub"), 0), - (Symbol("enum"), 4), - (Symbol("Price"), 9), - (BraceOpen, 15), - (Symbol("Limit"), 19), - (Comma, 24), - (Symbol("Market"), 28), - (Comma, 34), - (Symbol("StopLimit"), 38), - (BraceOpen, 48), - (Symbol("stop_price"), 50), - (Colon, 60), - (Symbol("f64"), 62), - (Comma, 65), - (BraceClose, 67), - (Comma, 68), - (BraceClose, 70), + (Symbol("pub"), 0..3), + (Symbol("enum"), 4..8), + (Symbol("Price"), 9..14), + (BraceOpen, 15..16), + (Symbol("Limit"), 19..24), + (Comma, 24..25), + 
(Symbol("Market"), 28..34), + (Comma, 34..35), + (Symbol("StopLimit"), 38..47), + (BraceOpen, 48..49), + (Symbol("stop_price"), 50..60), + (Colon, 60..61), + (Symbol("f64"), 62..65), + (Comma, 65..66), + (BraceClose, 67..68), + (Comma, 68..69), + (BraceClose, 70..71) ], ) } @@ -249,19 +261,19 @@ fn it_tokenize_struct() { offset: usize, }"#, &[ - (Symbol("pub"), 0), - (Symbol("struct"), 4), - (Symbol("Token"), 11), - (BraceOpen, 17), - (Symbol("kind"), 21), - (Colon, 25), - (Symbol("Token"), 27), - (Comma, 32), - (Symbol("offset"), 36), - (Colon, 42), - (Symbol("usize"), 44), - (Comma, 49), - (BraceClose, 51), + (Symbol("pub"), 0..3), + (Symbol("struct"), 4..10), + (Symbol("Token"), 11..16), + (BraceOpen, 17..18), + (Symbol("kind"), 21..25), + (Colon, 25..26), + (Symbol("Token"), 27..32), + (Comma, 32..33), + (Symbol("offset"), 36..42), + (Colon, 42..43), + (Symbol("usize"), 44..49), + (Comma, 49..50), + (BraceClose, 51..52) ], ) } @@ -278,35 +290,85 @@ fn it_tokenize_fn() { } }"#, &[ - (Symbol("fn"), 0), - (Symbol("is_priced_type"), 3), - (ParenthesisOpen, 17), - (Symbol("type"), 18), - (Colon, 22), - (Symbol("Price"), 24), - (ParenthesisClose, 29), - (BraceOpen, 31), - (Symbol("match"), 35), - (Symbol("type"), 41), - (BraceOpen, 46), - (Symbol("Price"), 52), - (ColonColon, 57), - (Symbol("Limit"), 59), - (Pipe, 65), - (Symbol("Price"), 67), - (ColonColon, 72), - (Symbol("StopLimit"), 74), - (BraceOpen, 84), - (DotDot, 86), - (BraceClose, 89), - (DoubleArrow, 91), - (Symbol("true"), 94), - (Comma, 98), - (Underscore, 104), - (DoubleArrow, 106), - (Symbol("false"), 109), - (BraceClose, 117), - (BraceClose, 119) + (Symbol("fn"), 0..2), + (Symbol("is_priced_type"), 3..17), + (ParenthesisOpen, 17..18), + (Symbol("type"), 18..22), + (Colon, 22..23), + (Symbol("Price"), 24..29), + (ParenthesisClose, 29..30), + (BraceOpen, 31..32), + (Symbol("match"), 35..40), + (Symbol("type"), 41..45), + (BraceOpen, 46..47), + (Symbol("Price"), 52..57), + (ColonColon, 57..59), + (Symbol("Limit"), 59..64), + (Pipe, 65..66), + (Symbol("Price"), 67..72), + (ColonColon, 72..74), + (Symbol("StopLimit"), 74..83), + (BraceOpen, 84..85), + (DotDot, 86..88), + (BraceClose, 89..90), + (DoubleArrow, 91..93), + (Symbol("true"), 94..98), + (Comma, 98..99), + (Underscore, 104..105), + (DoubleArrow, 106..108), + (Symbol("false"), 109..114), + (BraceClose, 117..118), + (BraceClose, 119..120) + ], + ) +} + +#[test] +fn it_tokenize_fn_with_doc_comments() { + use crate::lexer::TokenKind::*; + + assert_tokenize( + r#" +/// Whether or not this type of price accept a price +/// (e.g MARKET does not but LIMIT does) +fn is_priced_type(type: Price) { + match type { + Price::Limit | Price::StopLimit { .. 
} => true, + _ => false + } +}"#, + &[ + (DocComment(" Whether or not this type of price accept a price"), 1..53), + (DocComment(" (e.g MARKET does not but LIMIT does)"), 54..94), + (Symbol("fn"), 95..97), + (Symbol("is_priced_type"), 98..112), + (ParenthesisOpen, 112..113), + (Symbol("type"), 113..117), + (Colon, 117..118), + (Symbol("Price"), 119..124), + (ParenthesisClose, 124..125), + (BraceOpen, 126..127), + (Symbol("match"), 130..135), + (Symbol("type"), 136..140), + (BraceOpen, 141..142), + (Symbol("Price"), 147..152), + (ColonColon, 152..154), + (Symbol("Limit"), 154..159), + (Pipe, 160..161), + (Symbol("Price"), 162..167), + (ColonColon, 167..169), + (Symbol("StopLimit"), 169..178), + (BraceOpen, 179..180), + (DotDot, 181..183), + (BraceClose, 184..185), + (DoubleArrow, 186..188), + (Symbol("true"), 189..193), + (Comma, 193..194), + (Underscore, 199..200), + (DoubleArrow, 201..203), + (Symbol("false"), 204..209), + (BraceClose, 212..213), + (BraceClose, 214..215) ], ) } diff --git a/src/parser.rs b/src/parser.rs index 43d0d1e..50727e4 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,5 +1,6 @@ use alloc::{format, vec}; use alloc::borrow::Cow; +use alloc::boxed::Box; use alloc::rc::Rc; use alloc::vec::Vec; @@ -137,6 +138,7 @@ pub enum Expression<'a> { #[derive(Eq, PartialEq, Debug)] pub struct FunctionPrototype<'a> { + pub doc: Box<[&'a str]>, pub visibility: Visibility, pub name: Identifier<'a>, pub parameters: Vec>, @@ -196,6 +198,7 @@ impl<'a> AsRef for Identifier<'a> { #[derive(Eq, PartialEq, Debug)] pub struct NamedField<'a> { + pub doc: Box<[&'a str]>, pub visibility: Visibility, pub name: Identifier<'a>, pub typ: Type<'a>, @@ -216,12 +219,14 @@ pub enum Fields<'a> { #[derive(Eq, PartialEq, Debug)] pub struct EnumVariant<'a> { + pub doc: Box<[&'a str]>, pub name: Identifier<'a>, pub fields: Fields<'a>, } #[derive(Eq, PartialEq, Debug)] pub struct StructOrEnum<'a, T> { + pub doc: Box<[&'a str]>, pub visibility: Visibility, pub name: Identifier<'a>, pub body: T, @@ -280,6 +285,16 @@ fn parse_optional_group_with_dotdot<'a, T, F: Fn(&mut Vec>) -> Result< Ok(Some((items, dotdot_at_end))) } +fn parse_doc_comments<'a>(tokens: &mut Vec>) -> Box<[&'a str]> { + let mut result = vec![]; + while let Some(DocComment(content)) = peek_token(tokens) { + let content = *content; + tokens.remove(0); + result.push(content); + } + result.into_boxed_slice() +} + fn parse_visibility(tokens: &mut Vec) -> Visibility { if let Some(Symbol("pub")) = peek_token(tokens) { tokens.remove(0); @@ -373,17 +388,19 @@ fn parse_type<'a>(tokens: &mut Vec>) -> Result>> { fn parse_named_field<'a>(tokens: &mut Vec>) -> Result> { // let visibility = parse_visibility(tokens); // let name = parse_identifier(tokens).unwrap_or_expected_at(tokens, "expected a field name")?; + let doc = parse_doc_comments(tokens); let (name, visibility) = parse_identifier_and_keywords(tokens, Colon) .when_parsing("the named field identifier and visibility")? .unwrap_or_expected_at(tokens, "expected the `pub` keyword or an field identifier")?; - let r#type = parse_type(tokens) + let typ = parse_type(tokens) .when_parsing("the named field type")? 
.unwrap_or_expected_at(tokens, "expected a field type")?; Ok(NamedField { + doc, visibility, name, - typ: r#type, + typ, }) } @@ -411,7 +428,7 @@ pub fn parse_fields<'a>(tokens: &mut Vec>) -> Result> { }) } -fn parse_struct_inner<'a>(tokens: &mut Vec>, visibility: Visibility) -> Result> { +fn parse_struct_inner<'a>(tokens: &mut Vec>, visibility: Visibility, doc: Box<[&'a str]>) -> Result> { eat_token(tokens, Symbol("struct")).unwrap_or_expected_at(tokens, "expected the struct keyword")?; let name = parse_identifier(tokens).unwrap_or_expected_at(tokens, "expected a struct name")?; let body = parse_fields(tokens)?; @@ -420,6 +437,7 @@ fn parse_struct_inner<'a>(tokens: &mut Vec>, visibility: Visibility) - } Ok(StructOrEnum { + doc, visibility, name, body, @@ -427,30 +445,34 @@ fn parse_struct_inner<'a>(tokens: &mut Vec>, visibility: Visibility) - } pub fn parse_struct<'a>(tokens: &mut Vec>) -> Result> { + let doc = parse_doc_comments(tokens); let visibility = parse_visibility(tokens); - parse_struct_inner(tokens, visibility) + parse_struct_inner(tokens, visibility, doc) } pub fn parse_enum_variant<'a>(tokens: &mut Vec>) -> Result> { // let visibility = parse_visibility(tokens); // eat_token(tokens, Symbol("enum")).unwrap_or_expected_at(tokens, "expected the enum keyword")?; + let doc = parse_doc_comments(tokens); let name = parse_identifier(tokens).unwrap_or_expected_at(tokens, "expected a enum name")?; let fields = parse_fields(tokens)?; Ok(EnumVariant { + doc, name, fields, }) } -fn parse_enum_inner<'a>(tokens: &mut Vec>, visibility: Visibility) -> Result> { +fn parse_enum_inner<'a>(tokens: &mut Vec>, visibility: Visibility, doc: Box<[&'a str]>) -> Result> { eat_token(tokens, Symbol("enum")).unwrap_or_expected_at(tokens, "expected the enum keyword")?; let name = parse_identifier(tokens).unwrap_or_expected_at(tokens, "expected a enum name")?; let body = parse_group(tokens, parse_enum_variant, BraceOpen, BraceClose) .when_parsing("the enum body")?; Ok(StructOrEnum { + doc, visibility, name, body, @@ -458,9 +480,10 @@ fn parse_enum_inner<'a>(tokens: &mut Vec>, visibility: Visibility) -> } pub fn parse_enum<'a>(tokens: &mut Vec>) -> Result> { + let doc = parse_doc_comments(tokens); let visibility = parse_visibility(tokens); - parse_enum_inner(tokens, visibility) + parse_enum_inner(tokens, visibility, doc) } pub fn parse_field_init<'a>(tokens: &mut Vec>) -> Result<(Identifier<'a>, Expression<'a>)> { @@ -700,7 +723,7 @@ fn parse_return_type<'a>(tokens: &mut Vec>) -> Result> }) } -fn parse_function_inner<'a>(tokens: &mut Vec>, visibility: Visibility) -> Result> { +fn parse_function_inner<'a>(tokens: &mut Vec>, visibility: Visibility, doc: Box<[&'a str]>) -> Result> { eat_token(tokens, Symbol("fn")).unwrap_or_expected_at(tokens, "expected the `fn` keyword")?; let name = parse_identifier(tokens).unwrap_or_expected_at(tokens, "expected a function name")?; let parameters = parse_group(tokens, parse_parameter, ParenthesisOpen, ParenthesisClose) @@ -710,6 +733,7 @@ fn parse_function_inner<'a>(tokens: &mut Vec>, visibility: Visibility) Ok(FunctionDeclaration { prototype: FunctionPrototype { + doc, visibility, name, parameters, @@ -720,9 +744,10 @@ fn parse_function_inner<'a>(tokens: &mut Vec>, visibility: Visibility) } pub fn parse_function_declaration<'a>(tokens: &mut Vec>) -> Result> { + let doc = parse_doc_comments(tokens); let visibility = parse_visibility(tokens); - parse_function_inner(tokens, visibility) + parse_function_inner(tokens, visibility, doc) } pub enum Declaration<'a> { @@ -732,12 
+757,13 @@ pub enum Declaration<'a> { } pub fn parse_declaration<'a>(tokens: &mut Vec>) -> Result> { + let doc = parse_doc_comments(tokens); let visibility = parse_visibility(tokens); Ok(match peek_token(tokens) { - Some(Symbol("enum")) => Declaration::Enum(parse_enum_inner(tokens, visibility)?), - Some(Symbol("fn")) => Declaration::Function(parse_function_inner(tokens, visibility)?), - Some(Symbol("struct")) => Declaration::Struct(parse_struct_inner(tokens, visibility)?), + Some(Symbol("enum")) => Declaration::Enum(parse_enum_inner(tokens, visibility, doc)?), + Some(Symbol("fn")) => Declaration::Function(parse_function_inner(tokens, visibility, doc)?), + Some(Symbol("struct")) => Declaration::Struct(parse_struct_inner(tokens, visibility, doc)?), kind => Err(format!("expected a struct, enum or function declaration, got {kind:?}"))? }) } @@ -747,6 +773,9 @@ pub fn parse_declaration<'a>(tokens: &mut Vec>) -> Result Result<()> { let mut tokens = Token::parse_ascii(r#"pub struct Token { struct: TokenKind, + + /// Unlike Rust, Skrull supports keywords as identifiers natively ! :D + /// (not everywhere actually but it's still cool ^^') pub pub: usize, }"#)?; @@ -754,15 +783,21 @@ fn it_tokenize_struct_with_keywords_as_identifiers() -> Result<()> { assert_eq!( Struct { + doc: Box::new([]), visibility: Visibility::Pub, name: Identifier("Token"), body: Fields::NamedFields(vec![ NamedField { + doc: Box::new([]), visibility: Visibility::Default, name: Identifier("struct"), typ: Type::Identifier(Identifier("TokenKind")), }, NamedField { + doc: Box::new([ + " Unlike Rust, Skrull supports keywords as identifiers natively ! :D", + " (not everywhere actually but it's still cool ^^')" + ]), visibility: Visibility::Pub, name: Identifier("pub"), typ: Type::Usize, @@ -777,8 +812,14 @@ fn it_tokenize_struct_with_keywords_as_identifiers() -> Result<()> { #[test] fn it_tokenize_enum() -> Result<()> { - let mut tokens = /*language=rust*/Token::parse_ascii(r#"pub enum TokenKind { + let mut tokens = /*language=rust*/Token::parse_ascii(r#" +/// The kind of token that can be encountered +/// in this language +// usually not `Unexpected` +pub enum TokenKind { Equal, + + /// Should always generate an error when present Unexpected { character: char } }"#)?; @@ -786,17 +827,26 @@ fn it_tokenize_enum() -> Result<()> { assert_eq!( Enum { + doc: Box::new([ + " The kind of token that can be encountered", + " in this language" + ]), visibility: Visibility::Pub, name: Identifier("TokenKind"), body: vec![ EnumVariant { + doc: Box::new([]), name: Identifier("Equal"), fields: Fields::Unit, }, EnumVariant { + doc: Box::new([ + " Should always generate an error when present" + ]), name: Identifier("Unexpected"), fields: Fields::NamedFields(vec![ NamedField { + doc: Box::new([]), visibility: Visibility::Default, name: Identifier("character"), typ: Type::Identifier(Identifier("char")), @@ -849,6 +899,7 @@ fn it_tokenize_function_declaration() -> Result<()> { assert_eq!( FunctionDeclaration { prototype: FunctionPrototype { + doc: Box::new([]), visibility: Visibility::Pub, name: Identifier("ten"), parameters: vec![VariableSymbolDeclaration { diff --git a/src/printer/java.rs b/src/printer/java.rs index b553fc1..55821cc 100644 --- a/src/printer/java.rs +++ b/src/printer/java.rs @@ -2,6 +2,7 @@ use alloc::{format, vec}; use alloc::borrow::Cow; +use alloc::boxed::Box; use alloc::string::String; use core::fmt::{Debug, Display, Formatter}; @@ -702,15 +703,18 @@ enum TokenKind { assert_eq!( Struct { + doc: Box::new([]), visibility: 
Visibility::Pub, name: Identifier("Token"), body: Fields::NamedFields(vec![ NamedField { + doc: Box::new([]), visibility: Visibility::Default, name: Identifier("struct"), typ: Type::Identifier(Identifier("TokenKind")), }, NamedField { + doc: Box::new([]), visibility: Visibility::Pub, name: Identifier("pub"), typ: Type::Usize, diff --git a/src/printer/rust.rs b/src/printer/rust.rs index 6343f3f..a837bbb 100644 --- a/src/printer/rust.rs +++ b/src/printer/rust.rs @@ -1,5 +1,6 @@ use alloc::{format, vec}; use alloc::borrow::Cow; +use alloc::boxed::Box; use alloc::rc::Rc; use alloc::string::String; use core::fmt::{Debug, Display, Formatter}; @@ -236,15 +237,18 @@ enum TokenKind { assert_eq!( Struct { + doc: Box::new([]), visibility: Visibility::Pub, name: Identifier("Token"), body: Fields::NamedFields(vec![ NamedField { + doc: Box::new([]), visibility: Visibility::Default, name: Identifier("struct"), typ: Type::Identifier(Identifier("TokenKind")), }, NamedField { + doc: Box::new([]), visibility: Visibility::Pub, name: Identifier("pub"), typ: Type::Usize, @@ -284,6 +288,7 @@ pub fn life(mut unused: u32) -> i64 { assert_eq!( FunctionDeclaration { + doc: Box::new([]), prototype: FunctionPrototype { visibility: Visibility::Pub, name: Identifier("life"),