From b1a049f060a880cbdd405e11ab4843b037452c55 Mon Sep 17 00:00:00 2001 From: Jake Date: Mon, 24 Jun 2024 19:28:43 -0700 Subject: [PATCH 1/9] Add more lex tests --- src/lexer/lex.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index ca37c14..8d25600 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -25,10 +25,10 @@ int is_valid_numeric_or_id_char(char c) { return isalnum(c) || (c == '_') || (c == '.'); } -int real_lex(Lexer*, Token*); +int real_lex(Lexer *, Token *); /** - * This produces a list of tokens after having been processed by the + * This produces a list of tokens after having been processed by the * preprocessor. For example, if the code is * #define MAX_ARRAY 5 * int arr[MAX_ARRAY]; @@ -40,7 +40,7 @@ int real_lex(Lexer*, Token*); * ] * ; */ -int lex(Lexer* l, Token* t) { +int lex(Lexer *l, Token *t) { // For now, all we need to do is skip newlines for (;;) { real_lex(l, t); @@ -531,6 +531,9 @@ const char *ttype_name(TokenType tt) { return ttype_names[tt]; } int test_ttype_from_string() { testing_func_setup(); + tassert(ttype_from_string("+") == TT_PLUS); + tassert(ttype_from_string("=") == TT_ASSIGN); + tassert(ttype_from_string("1") == TT_LITERAL); tassert(ttype_from_string("1.2") == TT_LITERAL); From bfee00247ad539ece4aa321fe49db4690c247207 Mon Sep 17 00:00:00 2001 From: Jake Date: Mon, 24 Jun 2024 19:32:26 -0700 Subject: [PATCH 2/9] Add better docs --- src/lexer/lex.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 8d25600..cfe8ba8 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -203,6 +203,7 @@ int skip_to_token(Lexer *l) { return -1; // EOF was reached } +// This is a function for parsing single char tokens TokenType ttype_one_char(char c) { switch (c) { case '(': @@ -256,6 +257,7 @@ TokenType ttype_one_char(char c) { return TT_NO_TOKEN; } +// This is a function for parsing exclusively tokens with more than one char TokenType ttype_many_chars(const char *contents) { if (STREQ(contents, "auto")) { return TT_AUTO; @@ -423,6 +425,7 @@ TokenType ttype_many_chars(const char *contents) { return TT_IDENTIFIER; } +// This is the function for parsing all tokens from strings TokenType ttype_from_string(const char *contents) { int len; From a076961acfc5ba70ad6f38277386563eb49855df Mon Sep 17 00:00:00 2001 From: Jake Date: Mon, 24 Jun 2024 19:39:26 -0700 Subject: [PATCH 3/9] Add assert for error --- src/lexer/lex.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index cfe8ba8..7615a2b 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -259,6 +259,10 @@ TokenType ttype_one_char(char c) { // This is a function for parsing exclusively tokens with more than one char TokenType ttype_many_chars(const char *contents) { + if (strlen(contents) == 1) { + PRINT_ERROR("Please use ttype_from_string instead of ttype_many_chars for general use of tokenizing"); + } + if (STREQ(contents, "auto")) { return TT_AUTO; } else if (STREQ(contents, "break")) { From 3a33bee5eb337e90f759de1ae6c89ce135086404 Mon Sep 17 00:00:00 2001 From: Jake Date: Mon, 24 Jun 2024 19:53:20 -0700 Subject: [PATCH 4/9] Add more tests --- src/lexer/lex.c | 40 ++++++++++++++++++++++++++++++++++++---- src/lexer/lex.h | 6 ++++++ src/lexer/test_lexer.c | 3 +++ 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 7615a2b..aa11533 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -259,10 +259,6 @@ TokenType ttype_one_char(char c) { // This is a function for parsing exclusively tokens with more than one char TokenType ttype_many_chars(const char *contents) { - if (strlen(contents) == 1) { - PRINT_ERROR("Please use ttype_from_string instead of ttype_many_chars for general use of tokenizing"); - } - if (STREQ(contents, "auto")) { return TT_AUTO; } else if (STREQ(contents, "break")) { @@ -535,6 +531,42 @@ static const char *ttype_names[] = { const char *ttype_name(TokenType tt) { return ttype_names[tt]; } +int test_ttype_many_chars() { + testing_func_setup(); + + tassert(ttype_many_chars("foo") == TT_IDENTIFIER); + tassert(ttype_many_chars("struct") == TT_STRUCT); + tassert(ttype_many_chars("while") == TT_WHILE); + + return 0; +} + +int test_ttype_one_char() { + testing_func_setup(); + + // Use ttype_from_string + tassert(ttype_one_char('a') == TT_NO_TOKEN); + tassert(ttype_one_char('1') == TT_NO_TOKEN); + + tassert(ttype_one_char('+') == TT_PLUS); + tassert(ttype_one_char('-') == TT_MINUS); + tassert(ttype_one_char('>') == TT_GREATER); + tassert(ttype_one_char('~') == TT_BNOT); + + return 0; +} + +int test_ttype_name() { + testing_func_setup(); + + tassert(strcmp(ttype_name(TT_LITERAL), "literal") == 0); + tassert(strcmp(ttype_name(TT_PLUS), "+") == 0); + tassert(strcmp(ttype_name(TT_SIZEOF), "sizeof") == 0); + tassert(strcmp(ttype_name(TT_WHILE), "while") == 0); + + return 0; +} + int test_ttype_from_string() { testing_func_setup(); diff --git a/src/lexer/lex.h b/src/lexer/lex.h index 8ed0489..016da90 100644 --- a/src/lexer/lex.h +++ b/src/lexer/lex.h @@ -40,3 +40,9 @@ const char *ttype_name(TokenType tt); // Test for ttype_from_string int test_ttype_from_string(); + +int test_ttype_many_chars(); + +int test_ttype_one_char(); + +int test_ttype_name(); diff --git a/src/lexer/test_lexer.c b/src/lexer/test_lexer.c index 83a3087..9364a22 100644 --- a/src/lexer/test_lexer.c +++ b/src/lexer/test_lexer.c @@ -8,7 +8,10 @@ int test_lexer() { testing_module_setup(); + test_ttype_name(); test_ttype_from_string(); + test_ttype_many_chars(); + test_ttype_one_char(); testing_module_cleanup(); return 0; From b9cc2de2fbf272461ebbab506b748722a1f7dda7 Mon Sep 17 00:00:00 2001 From: Jake Date: Tue, 25 Jun 2024 10:56:52 -0700 Subject: [PATCH 5/9] Change to underscore --- src/lexer/lex.c | 24 ++++++++++++------------ src/lexer/lex.h | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index aa11533..423939f 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -204,7 +204,7 @@ int skip_to_token(Lexer *l) { } // This is a function for parsing single char tokens -TokenType ttype_one_char(char c) { +TokenType _ttype_one_char(char c) { switch (c) { case '(': return TT_OPAREN; // ( @@ -258,7 +258,7 @@ TokenType ttype_one_char(char c) { } // This is a function for parsing exclusively tokens with more than one char -TokenType ttype_many_chars(const char *contents) { +TokenType _ttype_many_chars(const char *contents) { if (STREQ(contents, "auto")) { return TT_AUTO; } else if (STREQ(contents, "break")) { @@ -433,7 +433,7 @@ TokenType ttype_from_string(const char *contents) { // Single character contents if (len == 1) { - TokenType token = ttype_one_char(contents[0]); + TokenType token = _ttype_one_char(contents[0]); if (token != TT_NO_TOKEN) { return token; @@ -534,9 +534,9 @@ const char *ttype_name(TokenType tt) { return ttype_names[tt]; } int test_ttype_many_chars() { testing_func_setup(); - tassert(ttype_many_chars("foo") == TT_IDENTIFIER); - tassert(ttype_many_chars("struct") == TT_STRUCT); - tassert(ttype_many_chars("while") == TT_WHILE); + tassert(_ttype_many_chars("foo") == TT_IDENTIFIER); + tassert(_ttype_many_chars("struct") == TT_STRUCT); + tassert(_ttype_many_chars("while") == TT_WHILE); return 0; } @@ -545,13 +545,13 @@ int test_ttype_one_char() { testing_func_setup(); // Use ttype_from_string - tassert(ttype_one_char('a') == TT_NO_TOKEN); - tassert(ttype_one_char('1') == TT_NO_TOKEN); + tassert(_ttype_one_char('a') == TT_NO_TOKEN); + tassert(_ttype_one_char('1') == TT_NO_TOKEN); - tassert(ttype_one_char('+') == TT_PLUS); - tassert(ttype_one_char('-') == TT_MINUS); - tassert(ttype_one_char('>') == TT_GREATER); - tassert(ttype_one_char('~') == TT_BNOT); + tassert(_ttype_one_char('+') == TT_PLUS); + tassert(_ttype_one_char('-') == TT_MINUS); + tassert(_ttype_one_char('>') == TT_GREATER); + tassert(_ttype_one_char('~') == TT_BNOT); return 0; } diff --git a/src/lexer/lex.h b/src/lexer/lex.h index 016da90..f09e02c 100644 --- a/src/lexer/lex.h +++ b/src/lexer/lex.h @@ -23,10 +23,10 @@ int lex(Lexer *l, Token *token); int unlex(Lexer *l, Token *token); // Find token type for single char -TokenType ttype_one_char(char c); +TokenType _ttype_one_char(char c); // Find token type for a char* with multiple characters -TokenType ttype_many_chars(const char *contents); +TokenType _ttype_many_chars(const char *contents); // Discern the token type from the given contents of unknown length TokenType ttype_from_string(const char *contents); From 5fc13ba6e6cfd99469adb4c7e0f5c5b42846061e Mon Sep 17 00:00:00 2001 From: Jake Date: Tue, 25 Jun 2024 10:59:04 -0700 Subject: [PATCH 6/9] Rename --- src/lexer/lex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 0411cde..01b2f1f 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -169,7 +169,7 @@ int real_lex(Lexer *l, Token *t) { // First up, we can just end here. if (in_string(init, single_char_tokens)) { t->length = pos; - t->type = ttype_one_char(init); + t->type = _ttype_one_char(init); return 0; } @@ -196,7 +196,7 @@ int real_lex(Lexer *l, Token *t) { // We've ended! ungetc(c, l->fp); t->contents[pos] = '\0'; - t->type = ttype_many_chars(t->contents); + t->type = _ttype_many_chars(t->contents); t->length = pos; return 0; } @@ -331,7 +331,6 @@ TokenType _ttype_one_char(char c) { case '?': return TT_QMARK; default: - PRINT_ERROR("Token type for token '%c' not recognized", c); return TT_NO_TOKEN; } } @@ -521,7 +520,7 @@ TokenType ttype_from_string(const char *contents) { } } - return ttype_many_chars(contents); + return _ttype_many_chars(contents); } static const char *ttype_names[] = { From 74f6a0d092b67b33fcb4a4d8d155e587d22415c1 Mon Sep 17 00:00:00 2001 From: Jake Date: Tue, 25 Jun 2024 11:01:49 -0700 Subject: [PATCH 7/9] Revert "Rename" This reverts commit 5fc13ba6e6cfd99469adb4c7e0f5c5b42846061e. --- src/lexer/lex.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 01b2f1f..0411cde 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -169,7 +169,7 @@ int real_lex(Lexer *l, Token *t) { // First up, we can just end here. if (in_string(init, single_char_tokens)) { t->length = pos; - t->type = _ttype_one_char(init); + t->type = ttype_one_char(init); return 0; } @@ -196,7 +196,7 @@ int real_lex(Lexer *l, Token *t) { // We've ended! ungetc(c, l->fp); t->contents[pos] = '\0'; - t->type = _ttype_many_chars(t->contents); + t->type = ttype_many_chars(t->contents); t->length = pos; return 0; } @@ -331,6 +331,7 @@ TokenType _ttype_one_char(char c) { case '?': return TT_QMARK; default: + PRINT_ERROR("Token type for token '%c' not recognized", c); return TT_NO_TOKEN; } } @@ -520,7 +521,7 @@ TokenType ttype_from_string(const char *contents) { } } - return _ttype_many_chars(contents); + return ttype_many_chars(contents); } static const char *ttype_names[] = { From 29717793f464ddedbef0fb62efa4e29b872dff60 Mon Sep 17 00:00:00 2001 From: Jake Date: Tue, 25 Jun 2024 11:07:17 -0700 Subject: [PATCH 8/9] Change to expected behavior for ttype functions --- src/lexer/lex.c | 32 ++++++++++++++++++-------------- src/lexer/lex.h | 4 ++-- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 0411cde..3ec7760 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -278,7 +278,8 @@ int skip_to_token(Lexer *l) { } // This is a function for parsing single char tokens -TokenType _ttype_one_char(char c) { +// Now handles all cases of single char tokens +TokenType ttype_one_char(char c) { switch (c) { case '(': return TT_OPAREN; // ( @@ -331,13 +332,16 @@ TokenType _ttype_one_char(char c) { case '?': return TT_QMARK; default: - PRINT_ERROR("Token type for token '%c' not recognized", c); - return TT_NO_TOKEN; + if (isdigit(c)) { + return TT_LITERAL; + } else { + return TT_IDENTIFIER; + } } } // This is a function for parsing exclusively tokens with more than one char -TokenType _ttype_many_chars(const char *contents) { +TokenType ttype_many_chars(const char *contents) { if (STREQ(contents, "auto")) { return TT_AUTO; } else if (STREQ(contents, "break")) { @@ -514,7 +518,7 @@ TokenType ttype_from_string(const char *contents) { // Single character contents if (len == 1) { - TokenType token = _ttype_one_char(contents[0]); + TokenType token = ttype_one_char(contents[0]); if (token != TT_NO_TOKEN) { return token; @@ -618,9 +622,9 @@ const char *ttype_name(TokenType tt) { return ttype_names[tt]; } int test_ttype_many_chars() { testing_func_setup(); - tassert(_ttype_many_chars("foo") == TT_IDENTIFIER); - tassert(_ttype_many_chars("struct") == TT_STRUCT); - tassert(_ttype_many_chars("while") == TT_WHILE); + tassert(ttype_many_chars("foo") == TT_IDENTIFIER); + tassert(ttype_many_chars("struct") == TT_STRUCT); + tassert(ttype_many_chars("while") == TT_WHILE); return 0; } @@ -629,13 +633,13 @@ int test_ttype_one_char() { testing_func_setup(); // Use ttype_from_string - tassert(_ttype_one_char('a') == TT_NO_TOKEN); - tassert(_ttype_one_char('1') == TT_NO_TOKEN); + tassert(ttype_one_char('a') == TT_IDENTIFIER); + tassert(ttype_one_char('1') == TT_LITERAL); - tassert(_ttype_one_char('+') == TT_PLUS); - tassert(_ttype_one_char('-') == TT_MINUS); - tassert(_ttype_one_char('>') == TT_GREATER); - tassert(_ttype_one_char('~') == TT_BNOT); + tassert(ttype_one_char('+') == TT_PLUS); + tassert(ttype_one_char('-') == TT_MINUS); + tassert(ttype_one_char('>') == TT_GREATER); + tassert(ttype_one_char('~') == TT_BNOT); return 0; } diff --git a/src/lexer/lex.h b/src/lexer/lex.h index f09e02c..016da90 100644 --- a/src/lexer/lex.h +++ b/src/lexer/lex.h @@ -23,10 +23,10 @@ int lex(Lexer *l, Token *token); int unlex(Lexer *l, Token *token); // Find token type for single char -TokenType _ttype_one_char(char c); +TokenType ttype_one_char(char c); // Find token type for a char* with multiple characters -TokenType _ttype_many_chars(const char *contents); +TokenType ttype_many_chars(const char *contents); // Discern the token type from the given contents of unknown length TokenType ttype_from_string(const char *contents); From 07c772d059f3d54a5c87bae8dfae40c5a9623c6e Mon Sep 17 00:00:00 2001 From: Jake Date: Tue, 25 Jun 2024 11:37:43 -0700 Subject: [PATCH 9/9] Remove unreachable if --- src/lexer/lex.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 3ec7760..a716de9 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -519,10 +519,7 @@ TokenType ttype_from_string(const char *contents) { // Single character contents if (len == 1) { TokenType token = ttype_one_char(contents[0]); - - if (token != TT_NO_TOKEN) { - return token; - } + return token; } return ttype_many_chars(contents);