From 8910d5a8a025ee589ba2526e2dfa8b2ec86f65f5 Mon Sep 17 00:00:00 2001 From: adamhutchings Date: Wed, 26 Jun 2024 00:22:21 -0400 Subject: [PATCH 1/6] Replace all getc/ungetc with new wrapper --- src/lexer/lex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index 0fd7146..15c2569 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -248,11 +248,11 @@ int real_lex(Lexer *l, Token *t) { // return./ if (starts_operator(init)) { while (valid_operator_sequence(t->contents)) { - t->contents[pos++] = (c = getc(l->fp)); + t->contents[pos++] = (c = lexer_getchar(l->fp)); } // We've ended! // Can we reduce this code duplication from above in a smart way? - ungetc(c, l->fp); + lexer_ungetchar(l); t->contents[pos - 1] = '\0'; t->type = ttype_from_string(t->contents); t->length = pos; From 7fd0973e21e747f0370a724998984111f522a1e0 Mon Sep 17 00:00:00 2001 From: Jake Date: Tue, 25 Jun 2024 22:40:45 -0700 Subject: [PATCH 2/6] Add basic parsing code --- src/codegen/x86/codegen.c | 50 ++++++++++++--------- src/codegen/x86/codegen.h | 2 + src/driver/main.c | 35 ++++++++------- src/parser/parse.c | 94 +++++++++++++++++++++++++++++++++++++++ src/parser/parse.h | 5 +++ tests/simplemain.c | 3 ++ 6 files changed, 153 insertions(+), 36 deletions(-) create mode 100644 src/parser/parse.c create mode 100644 src/parser/parse.h create mode 100644 tests/simplemain.c diff --git a/src/codegen/x86/codegen.c b/src/codegen/x86/codegen.c index e0b317d..fae49b3 100644 --- a/src/codegen/x86/codegen.c +++ b/src/codegen/x86/codegen.c @@ -21,41 +21,49 @@ void code_gen_init() { char *start_main() { static char start[256] = "\ -global _start\ -section .text\ -\ -_start:"; +global _start\n\ +section .text\n\ +\n\ +_start:\n"; return start; } char *end_main() { static char end[256] = "\ -mov rax, 60\ -mov rdi, 0\ -syscall"; + mov rax, 60\ + mov rdi, 0\ + syscall"; + + return end; +} + +char *end_main_custom_return(int val) { + char *end; + end = (char *)malloc(256 * sizeof(char)); + sprintf(end, " mov rax, 60\n mov rdi, %d\n syscall\n", val); return end; } char *start_func() { static char start[256] = "\ -sub rsp, 32\ -mov [rsp], r12\ -mov [rsp+8], r13\ -mov [rsp+16], r14\ -mov [rsp+24], r15"; + sub rsp, 32\ + mov [rsp], r12\ + mov [rsp+8], r13\ + mov [rsp+16], r14\ + mov [rsp+24], r15"; return start; } char *end_func() { static char end[256] = "\ -mov r12, [rsp]\ -mov r13, [rsp+8]\ -mov r14, [rsp+16]\ -mov r15, [rsp+24]\ -add rsp, 32"; + mov r12, [rsp]\ + mov r13, [rsp+8]\ + mov r14, [rsp+16]\ + mov r15, [rsp+24]\ + add rsp, 32"; return end; } @@ -65,16 +73,16 @@ char *init_int_literal(int val) { char *init; init = (char *)malloc(256 * sizeof(char)); - sprintf(init, "mov [rsp+%d], %d", GEN_STATE.rsp_offset, val); + sprintf(init, " mov [rsp+%d], %d", GEN_STATE.rsp_offset, val); return init; } int test_init_int_literal() { - testing_func_setup(); + testing_func_setup(); code_gen_init(); - tassert(strcmp(init_int_literal(100), "mov [rsp+8], 100") == 0); + tassert(strcmp(init_int_literal(100), " mov [rsp+8], 100") == 0); - return 0; + return 0; } diff --git a/src/codegen/x86/codegen.h b/src/codegen/x86/codegen.h index 4af672d..5e3a489 100644 --- a/src/codegen/x86/codegen.h +++ b/src/codegen/x86/codegen.h @@ -8,6 +8,8 @@ char *end_main(); char *start_func(); +char *end_main_custom_return(int val); + char *end_func(); char *init_int_literal(int val); diff --git a/src/driver/main.c b/src/driver/main.c index 42ee68b..4a7d3ea 100644 --- a/src/driver/main.c +++ b/src/driver/main.c @@ -6,13 +6,14 @@ #include // strcmp #include +#include #include -int lexer_dump(const char* filename) { +int lexer_dump(const char *filename) { // Initialization of everything Lexer lexer; - FILE * fp = fopen(filename, "r"); + FILE *fp = fopen(filename, "r"); if (!fp) { PRINT_ERROR("File %s not found", filename); return 1; @@ -24,12 +25,13 @@ int lexer_dump(const char* filename) { Token t; do { // Return if some non-zero (error) code is returned - if (lex(&lexer, &t)) return 1; - printf("Contents: %20s, type: %20s, position: %d/%d\n", t.contents, ttype_name(t.type), t.line, t.column); + if (lex(&lexer, &t)) + return 1; + printf("Contents: %20s, type: %20s, position: %d/%d\n", t.contents, + ttype_name(t.type), t.line, t.column); } while (t.type != TT_EOF); return 0; - } int main(int argc, char **argv) { @@ -38,14 +40,16 @@ int main(int argc, char **argv) { // Skip the name of the executable. --argc, ++argv; - + if (argc == 0) { PRINT_DEFAULT("Usage: --token-dump to see all tokens"); return 0; } if (argc == 1) { - PRINT_DEFAULT("default compilation not supported yet -- try 'jccc --token-dump %s' instead.", argv[0]); + PRINT_DEFAULT("default compilation not supported yet -- try 'jccc " + "--token-dump %s' instead.", + argv[0]); return 1; } @@ -55,13 +59,14 @@ int main(int argc, char **argv) { } // Two arguments now. - if (strcmp(argv[0], "--token-dump")) { - PRINT_ERROR("option %s not recognized.", argv[1]); - return 1; - } - - // Finally, we can do the lexer test properly! - return lexer_dump(argv[1]); + if (strcmp(argv[0], "--token-dump") == 0) { + // Finally, we can do the lexer test properly! + return lexer_dump(argv[1]); + } else if (strcmp(argv[0], "--test-parse") == 0) { + parse(argv[1]); + return 0; + } - return 0; + PRINT_ERROR("option %s not recognized.", argv[1]); + return 1; } diff --git a/src/parser/parse.c b/src/parser/parse.c new file mode 100644 index 0000000..d1da408 --- /dev/null +++ b/src/parser/parse.c @@ -0,0 +1,94 @@ +/* Parser + * + */ + +#include +#include +#include +#include // calloc +#include // strcmp +#include + +int parse(const char *filename) { + + Lexer lexer; + + FILE *fp = fopen(filename, "r"); + + if (!fp) { + PRINT_ERROR("File %s not found", filename); + return 1; + } + + lexer.fp = fp; + lexer.unlexed_count = 0; + lexer.column = lexer.line = 1; + + Token t; + + int i = 0; + int buffer_size = 16; + Token *tokens = calloc(buffer_size, sizeof(Token)); + + do { + if (lex(&lexer, &t)) { + return 1; + } + + if (buffer_size <= i) { + buffer_size *= 2; + tokens = calloc(buffer_size, sizeof(Token)); + } + + tokens[i] = t; + + printf("Contents: %20s, type: %20s, position: %d/%d\n", t.contents, + ttype_name(t.type), t.line, t.column); + + i++; + } while (t.type != TT_EOF); + + // Main function + if (tokens[0].type == TT_INT && tokens[1].type == TT_IDENTIFIER && + (strcmp(tokens[1].contents, "main") == 0)) { + + // Correct empty function body + if (tokens[2].type == TT_OPAREN && tokens[3].type == TT_CPAREN && + tokens[4].type == TT_OBRACE) { + + // Return value + if (tokens[5].type == TT_RETURN && tokens[6].type == TT_LITERAL && + tokens[7].type == TT_SEMI) { + + // Correct matched closed brace + if (tokens[8].type == TT_CBRACE) { + printf("\n"); + + // Generate preamble main code + char *code_start = start_main(); + + printf(code_start); + + // Add custom return code + char *code_end = + end_main_custom_return(atoi(tokens[6].contents)); + + printf(code_end); + + } else { + PRINT_ERROR("Wrong closing brace.\n"); + } + } else { + PRINT_ERROR("Return value is wrong.\n"); + } + } else { + PRINT_ERROR("Wrong main function body.\n"); + } + } else { + PRINT_ERROR("Not correct main function.\n"); + } + + return 0; +} + +int parse_simple_main_func() {} diff --git a/src/parser/parse.h b/src/parser/parse.h new file mode 100644 index 0000000..8ea9ad8 --- /dev/null +++ b/src/parser/parse.h @@ -0,0 +1,5 @@ +/* Parser + * + */ + +int parse(const char *filename); diff --git a/tests/simplemain.c b/tests/simplemain.c new file mode 100644 index 0000000..aa5fd90 --- /dev/null +++ b/tests/simplemain.c @@ -0,0 +1,3 @@ +int main() { + return 5; +} From ef78539671f7566417acb0501a5d6ef1f284f3a3 Mon Sep 17 00:00:00 2001 From: adamhutchings Date: Wed, 26 Jun 2024 13:19:32 -0400 Subject: [PATCH 3/6] Add structures we'll need for the first parser iteration --- src/parser/cst.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 src/parser/cst.h diff --git a/src/parser/cst.h b/src/parser/cst.h new file mode 100644 index 0000000..59fa469 --- /dev/null +++ b/src/parser/cst.h @@ -0,0 +1,62 @@ +/** + * The structures for a concrete syntax tree. + * For now, the subset of C we are parsing is quite simple: + * - Parameterless functions. + * - Return statements, which accept integers or function calls. + */ + +#pragma once + +#include "list.h" + +// A list of all node types. +typedef enum { + NT_STMT, + NT_EXPR, + NT_BLOCK_STMT, + NT_RETURN_STMT, + NT_FUNCDECL, + NT_FUNCCALL, + NT_LITERAL, +} NodeType; + +// A block statement is just a list of statements. +typedef struct { + List* stmts; // A list of Statement structs. +} BlockStatement; + +typedef struct { + // TODO -- add parameters whe we get there + BlockStatement body; + const char name[256]; // The actual name of the function. +} FunctionDeclaration; + +// An entire program is just a list of top level declarations. +// For now, such declarations are only functions. +typedef struct { + union { + FunctionDeclaration fd; + // VariableDeclaration vd; when we get there + } u; + NodeType type; +} TopLevelDeclaration; + +// Right now, a function call doesn't have any parameters so it's just the name +// of the function being called. +typedef struct { + const char name[256]; +} FunctionCall; + +// An expression for now is an integer or a function call. +typedef struct { + union { + FunctionCall fc; + const char literal[256]; + } u; + NodeType type; +} Expression; + +// Finally, an entire source file is a list of top-level declarations. +typedef struct { + List* decls; // list of TopLevelDeclaration +} ConcreteFileTree; From e669ebf8f2f0305d8a8ed04df118674b3af5bc9f Mon Sep 17 00:00:00 2001 From: adamhutchings Date: Wed, 26 Jun 2024 13:21:48 -0400 Subject: [PATCH 4/6] Remove consts (because we need to set these names somehow) --- src/parser/cst.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parser/cst.h b/src/parser/cst.h index 59fa469..3184cf5 100644 --- a/src/parser/cst.h +++ b/src/parser/cst.h @@ -28,7 +28,7 @@ typedef struct { typedef struct { // TODO -- add parameters whe we get there BlockStatement body; - const char name[256]; // The actual name of the function. + char name[256]; // The actual name of the function. } FunctionDeclaration; // An entire program is just a list of top level declarations. @@ -44,14 +44,14 @@ typedef struct { // Right now, a function call doesn't have any parameters so it's just the name // of the function being called. typedef struct { - const char name[256]; + char name[256]; } FunctionCall; // An expression for now is an integer or a function call. typedef struct { union { FunctionCall fc; - const char literal[256]; + char literal[256]; } u; NodeType type; } Expression; From 16ee5342f97fae19890b167119d0eb3b7959174e Mon Sep 17 00:00:00 2001 From: Jake Date: Wed, 26 Jun 2024 15:21:31 -0700 Subject: [PATCH 5/6] Close files and check for numeric --- src/driver/main.c | 2 ++ src/parser/parse.c | 15 +++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/driver/main.c b/src/driver/main.c index 4a7d3ea..d963669 100644 --- a/src/driver/main.c +++ b/src/driver/main.c @@ -31,6 +31,8 @@ int lexer_dump(const char *filename) { ttype_name(t.type), t.line, t.column); } while (t.type != TT_EOF); + fclose(fp); + return 0; } diff --git a/src/parser/parse.c b/src/parser/parse.c index d1da408..5a7190c 100644 --- a/src/parser/parse.c +++ b/src/parser/parse.c @@ -7,6 +7,7 @@ #include #include // calloc #include // strcmp +#include // isdigit #include int parse(const char *filename) { @@ -58,18 +59,18 @@ int parse(const char *filename) { // Return value if (tokens[5].type == TT_RETURN && tokens[6].type == TT_LITERAL && - tokens[7].type == TT_SEMI) { + isdigit(tokens[6].contents[0]) && tokens[7].type == TT_SEMI) { // Correct matched closed brace if (tokens[8].type == TT_CBRACE) { - printf("\n"); - - // Generate preamble main code + printf("\n"); + + // Generate preamble main code char *code_start = start_main(); - printf(code_start); + printf(code_start); - // Add custom return code + // Add custom return code char *code_end = end_main_custom_return(atoi(tokens[6].contents)); @@ -88,6 +89,8 @@ int parse(const char *filename) { PRINT_ERROR("Not correct main function.\n"); } + fclose(fp); + return 0; } From 336bee13eca8dc6c445198575034b3741a4f985a Mon Sep 17 00:00:00 2001 From: adamhutchings Date: Thu, 27 Jun 2024 17:04:14 -0400 Subject: [PATCH 6/6] Add parser framework --- src/lexer/lex.c | 2 +- src/parser/cst.h | 2 +- src/parser/parse.c | 35 ++++++++++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/lexer/lex.c b/src/lexer/lex.c index c94a9cc..7d6a61e 100644 --- a/src/lexer/lex.c +++ b/src/lexer/lex.c @@ -248,7 +248,7 @@ int real_lex(Lexer *l, Token *t) { // return./ if (starts_operator(init)) { while (valid_operator_sequence(t->contents)) { - t->contents[pos++] = (c = lexer_getchar(l->fp)); + t->contents[pos++] = (c = lexer_getchar(l)); } // We've ended! // Can we reduce this code duplication from above in a smart way? diff --git a/src/parser/cst.h b/src/parser/cst.h index 3184cf5..f68ec16 100644 --- a/src/parser/cst.h +++ b/src/parser/cst.h @@ -7,7 +7,7 @@ #pragma once -#include "list.h" +#include // A list of all node types. typedef enum { diff --git a/src/parser/parse.c b/src/parser/parse.c index 5a7190c..6176d52 100644 --- a/src/parser/parse.c +++ b/src/parser/parse.c @@ -2,12 +2,14 @@ * */ -#include -#include -#include #include // calloc #include // strcmp #include // isdigit + +#include +#include +#include +#include #include int parse(const char *filename) { @@ -95,3 +97,30 @@ int parse(const char *filename) { } int parse_simple_main_func() {} + +/** + * Proper parsing code below -- producing a concrete syntax tree from a file. + * Each of these functions will probably reference the others numerous times. + */ + +int parse_expr(Lexer* l, Expression* ex) { + // TODO (just a literal or a function call for now). +} + +int parse_funccall(Lexer* l, Expression* ex) { + // TODO +} + +int parse_blockstmt(Lexer* l, BlockStatement* bs) { + // TODO +} + +int parse_funcdecl(Lexer* l, FunctionDeclaration* fd) { + // TODO +} + +// Parse function -- takes a lexer and produces a concrete syntax tree. Fill the +// struct which we have given with the data. +int make_cst(Lexer* l, ConcreteFileTree* tree) { + // TODO +}