From e20ad299f3014b9cbb221fbacaec4dbe32e4cca4 Mon Sep 17 00:00:00 2001
From: d0rianb <dorian.beauchesne@icloud.com>
Date: Sun, 14 Apr 2024 12:02:17 +0200
Subject: [PATCH 1/3] Improve README.md spacing

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8f2f188..2e2f9f8 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
 [![docs.rs](https://img.shields.io/docsrs/rtf-parser?style=flat-square)](https://docs.rs/rtf-parser)
 
 A safe Rust RTF parser &amp; lexer library designed for speed and memory efficiency, with no external dependencies.
+  
 The official documentation is available at [docs.rs/rtf-parser](https://docs.rs/rtf-parser).
 
 ## Installation
@@ -51,7 +52,8 @@ The `RtfDocument` struct implement the `TryFrom` trait for :
 
 and a `from_filepath` constructor that handle the i/o internally. 
 
-The error returned can be a `LexerError` or a `ParserError` depending on the phase wich failed.   
+The error returned can be a `LexerError` or a `ParserError` depending on the phase which failed.  
+
 
 An `RtfDocument` is composed with : 
 - the **header**, containing among others the font table, the color table and the encoding.
@@ -166,6 +168,7 @@ However, the `rtf-grimoire` crate provide a similar *Lexer*. Here is a quick ben
 | [`rtf-grimoire`](https://crates.io/crates/rtf-grimoire) (only lexing) | v0.2.1  | _123 ms_ |
 
 *This benchmark has been made on an Intel MacBook Pro*.  
+
 For the `rtf-parser`, most of the compute time (_65 %_) is spent by the lexing process. There is still lot of room for improvement.  
 
 

From 72b653b6cbd0d2e7560badbcb5e4bdfb275c817e Mon Sep 17 00:00:00 2001
From: d0rianb <dorian.beauchesne@icloud.com>
Date: Sun, 14 Apr 2024 17:05:12 +0200
Subject: [PATCH 2/3] Rework the header parsing to only consume the known
 tables

---
 src/parser.rs | 71 ++++++++++++++++++++-------------------------------
 1 file changed, 28 insertions(+), 43 deletions(-)

diff --git a/src/parser.rs b/src/parser.rs
index 8c2e517..5df6d52 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -13,10 +13,10 @@ use crate::tokens::{ControlWord, Property, Token};
 // Use to specify control word in parse_header
 macro_rules! header_control_word {
     ($cw:ident) => {
-        Token::ControlSymbol((ControlWord::$cw, _))
+        &Token::ControlSymbol((ControlWord::$cw, _))
     };
     ($cw:ident, $prop:ident) => {
-        Token::ControlSymbol((ControlWord::$cw, Property::$prop))
+        &Token::ControlSymbol((ControlWord::$cw, Property::$prop))
     };
 }
 
@@ -246,59 +246,39 @@ impl<'a> Parser<'a> {
         }
         return ret;
     }
+    
+    // Consume all the tokens inside a group ({ ... }) and returns the includes ones
+    fn consume_group(&mut self) -> Vec<Token<'a>> {
+        // TODO: check the the token at cursor is indeed an OpeningBracket
+        self.consume_token_at(self.cursor); // Consume the opening bracket
+        return self.consume_tokens_until_matching_bracket();
+    }
 
     // Consume all tokens until the header is read
     fn parse_header(&mut self) -> Result<RtfHeader, ParserError> {
         self.cursor = 0; // Reset the cursor
         let mut header = RtfHeader::default();
-        while let (token, next_token) = (self.consume_next_token(), self.get_next_token()) {
+        while let (Some(token), Some(next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {
             match (token, next_token) {
-                (Some(Token::OpeningBracket), Some(&header_control_word!(FontTable, None))) => {
-                    let font_table_tokens = self.consume_tokens_until_matching_bracket();
+                (Token::OpeningBracket, header_control_word!(FontTable, None)) => {
+                    let font_table_tokens = self.consume_group();
                     header.font_table = Self::parse_font_table(&font_table_tokens)?;
-                    // After the font table, check if next token is plain text without consuming it. If so, break
-                    if let Some(&Token::PlainText(_text)) = self.get_next_token() {
-                        break;
-                    }
                 }
-                (Some(Token::OpeningBracket), Some(&header_control_word!(ColorTable, None))) => {
-                    let color_table_tokens = self.consume_tokens_until_matching_bracket();
+                (Token::OpeningBracket, header_control_word!(ColorTable, None)) => {
+                    let color_table_tokens = self.consume_group();
                     header.color_table = Self::parse_color_table(&color_table_tokens)?;
-                    // After the color table, check if next token is plain text without consuming it. If so, break
-                    if let Some(&Token::PlainText(_text)) = self.get_next_token() {
-                        break;
-                    }
                 }
-                (Some(Token::OpeningBracket), Some(&header_control_word!(StyleSheet, None))) => {
-                    let stylesheet_tokens = self.consume_tokens_until_matching_bracket();
+                (Token::OpeningBracket, header_control_word!(StyleSheet, None)) => {
+                    let stylesheet_tokens = self.consume_group();
                     header.stylesheet = Self::parse_stylesheet(&stylesheet_tokens)?;
-                    // After the stylesheet, check if next token is plain text without consuming it. If so, break
-                    if let Some(&Token::PlainText(_text)) = self.get_next_token() {
-                        break;
-                    }
-                }
-                // Break on par, pard, sectd, or plain - We no longer are in the header
-                (Some(header_control_word!(Pard) | header_control_word!(Sectd) | header_control_word!(Plain) | header_control_word!(Par)), _) => break,
-                // Break if it declares a font after the font table --> no more in the header
-                (Some(header_control_word!(FontNumber)), _) => {
-                    if !header.font_table.is_empty() {
-                        break;
-                    }
                 }
                 // Check and consume token
-                (Some(ref token), _) => {
+                (token, _) => {
                     if let Some(charset) = CharacterSet::from(token) {
                         header.character_set = charset;
                     }
+                    self.cursor += 1;
                 }
-                // Check next without consuming token : break conditions
-                (_, Some(token)) => {
-                    // Break on plain text not belonging to any table in the header
-                    if let Token::PlainText(_text) = token {
-                        break;
-                    }
-                }
-                (None, None) => break,
             }
         }
         return Ok(header);
@@ -308,7 +288,7 @@ impl<'a> Parser<'a> {
         let Some(font_table_first_token) = font_tables_tokens.get(0) else {
             return Err(ParserError::NoMoreToken);
         };
-        if font_table_first_token != &header_control_word!(FontTable, None) {
+        if font_table_first_token != header_control_word!(FontTable, None) {
             return Err(ParserError::InvalidToken(format!("{:?} is not a FontTable token", font_table_first_token)));
         }
         let mut table = HashMap::new();
@@ -349,7 +329,7 @@ impl<'a> Parser<'a> {
         let Some(color_table_first_token) = color_table_tokens.get(0) else {
             return Err(ParserError::NoMoreToken);
         };
-        if color_table_first_token != &header_control_word!(ColorTable, None) {
+        if color_table_first_token != header_control_word!(ColorTable, None) {
             return Err(ParserError::InvalidToken(format!("ParserError: {:?} is not a ColorTable token", color_table_first_token)));
         }
         let mut table = HashMap::new();
@@ -398,9 +378,7 @@ impl<'a> Parser<'a> {
                     self.consume_token_at(self.cursor); // Consume the opening bracket
                     self.consume_tokens_until_matching_bracket();
                 }
-                _ => {
-                    self.cursor += 1;
-                }
+                _ => { self.cursor += 1; }
             }
         }
     }
@@ -669,4 +647,11 @@ pub mod tests {
         let document = Parser::new(tokens).parse().unwrap();
         assert_eq!(&document.body[0].text, "啊 啊");
     }
+    
+    #[test]
+    fn body_starts_with_a_group() {
+        let rtf = r"{\rtf1\ansi\deff0{\fonttbl {\f0\fnil\fcharset0 Calibri;}{\f1\fnil\fcharset2 Symbol;}}{\colortbl ;}{\pard \u21435  \sb70\par}}";
+        let tokens = Lexer::scan(rtf).unwrap();
+        let document = Parser::new(tokens).parse().unwrap();
+    }
 }

From 87dd799e7e59530c21ec86ab5d1e944d1534941b Mon Sep 17 00:00:00 2001
From: d0rianb <dorian.beauchesne@icloud.com>
Date: Sun, 14 Apr 2024 17:39:10 +0200
Subject: [PATCH 3/3] Merge parse_ignorable_destination & parse_header for
 performance

---
 src/parser.rs | 55 +++++++++++++++++++++++----------------------------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/src/parser.rs b/src/parser.rs
index 5df6d52..a206db9 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -105,9 +105,9 @@ impl<'a> Parser<'a> {
     pub fn parse(&mut self) -> Result<RtfDocument, ParserError> {
         self.check_document_validity()?;
         let mut document = RtfDocument::default(); // Init empty document
-        self.parse_ignore_groups(); // delete the ignore groups
+        // Traverse the document and consume the header groups (FontTable, StyleSheet, etc ...)
         document.header = self.parse_header()?;
-        // Parse Body
+        // Parse the body
         let mut painter_stack: Vec<Painter> = vec![Painter::default()];
         let mut paragraph = Paragraph::default();
         let mut it = self.tokens.iter();
@@ -246,7 +246,7 @@ impl<'a> Parser<'a> {
         }
         return ret;
     }
-    
+
     // Consume all the tokens inside a group ({ ... }) and returns the includes ones
     fn consume_group(&mut self) -> Vec<Token<'a>> {
         // TODO: check the the token at cursor is indeed an OpeningBracket
@@ -258,8 +258,24 @@ impl<'a> Parser<'a> {
     fn parse_header(&mut self) -> Result<RtfHeader, ParserError> {
         self.cursor = 0; // Reset the cursor
         let mut header = RtfHeader::default();
-        while let (Some(token), Some(next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {
+        while let (Some(token), Some(mut next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {
+            
+            // Handle the case where CRLF tokens sit between the `{` and its control word,
+            // e.g. `{\n \*\ignoregroup }`
+            let mut i = 0;
+            while *next_token == Token::CRLF {
+                if let Some(next_token_not_crlf) = self.get_token_at(self.cursor + 1 + i) {
+                    next_token = next_token_not_crlf;
+                    i += 1;
+                } else {
+                    break;
+                }
+            }
             match (token, next_token) {
+                (Token::OpeningBracket, Token::IgnorableDestination) => {
+                    let ignore_group_tokens = self.consume_group();
+                    Self::parse_ignore_groups(&ignore_group_tokens);
+                }
                 (Token::OpeningBracket, header_control_word!(FontTable, None)) => {
                     let font_table_tokens = self.consume_group();
                     header.font_table = Self::parse_font_table(&font_table_tokens)?;
@@ -353,34 +369,13 @@ impl<'a> Parser<'a> {
         return Ok(table);
     }
 
-    fn parse_stylesheet(stylesheet_tokens: &Vec<Token<'a>>) -> Result<StyleSheet, ParserError> {
-        // TODO
+    fn parse_stylesheet(_stylesheet_tokens: &Vec<Token<'a>>) -> Result<StyleSheet, ParserError> {
+        // TODO : parse the stylesheet
         return Ok(StyleSheet::from([]));
     }
 
-    // Traverse all the tokens and consume the ignore groups
-    fn parse_ignore_groups(&mut self) {
-        self.cursor = 0; // Reset the cursor
-        while let (Some(token), Some(mut next_token)) = (self.get_token_at(self.cursor), self.get_token_at(self.cursor + 1)) {
-            let mut i = 0;
-            // Manage the case where there is CRLF between { and ignore_group
-            // {\n /*/ignoregroup }
-            while *next_token == Token::CRLF {
-                if let Some(next_token_not_crlf) = self.get_token_at(self.cursor + 1 + i) {
-                    next_token = next_token_not_crlf;
-                    i += 1;
-                } else {
-                    break;
-                }
-            }
-            match (token, next_token) {
-                (Token::OpeningBracket, Token::IgnorableDestination) => {
-                    self.consume_token_at(self.cursor); // Consume the opening bracket
-                    self.consume_tokens_until_matching_bracket();
-                }
-                _ => { self.cursor += 1; }
-            }
-        }
+    fn parse_ignore_groups(_tokens: &Vec<Token<'a>>) {
+        // Do nothing for now
     }
 }
 
@@ -647,7 +642,7 @@ pub mod tests {
         let document = Parser::new(tokens).parse().unwrap();
         assert_eq!(&document.body[0].text, "啊 啊");
     }
-    
+
     #[test]
     fn body_starts_with_a_group() {
         let rtf = r"{\rtf1\ansi\deff0{\fonttbl {\f0\fnil\fcharset0 Calibri;}{\f1\fnil\fcharset2 Symbol;}}{\colortbl ;}{\pard \u21435  \sb70\par}}";