Merge branch 'main' of https://github.com/jabacat/jccc

jabacat · Jul 8, 2024 · 1632e0f · 1632e0f
2 parents d0aa0ee + 3056da3
commit 1632e0f
Show file tree

Hide file tree

Showing 8 changed files with 251 additions and 38 deletions.
diff --git a/src/codegen/x86/codegen.c b/src/codegen/x86/codegen.c
@@ -21,41 +21,49 @@ void code_gen_init() {
 
 char *start_main() {
     static char start[256] = "\
-global _start\
-section .text\
-\
-_start:";
+global _start\n\
+section .text\n\
+\n\
+_start:\n";
 
     return start;
 }
 
 char *end_main() {
     static char end[256] = "\
-mov rax, 60\
-mov rdi, 0\
-syscall";
+	mov rax, 60\
+	mov rdi, 0\
+	syscall";
+
+    return end;
+}
+
+char *end_main_custom_return(int val) {
+    char *end;
+    end = (char *)malloc(256 * sizeof(char));
+    sprintf(end, "	mov rax, 60\n	mov rdi, %d\n	syscall\n", val);
 
     return end;
 }
 
 char *start_func() {
     static char start[256] = "\
-sub rsp, 32\
-mov [rsp], r12\
-mov [rsp+8], r13\
-mov [rsp+16], r14\
-mov [rsp+24], r15";
+	sub rsp, 32\
+	mov [rsp], r12\
+	mov [rsp+8], r13\
+	mov [rsp+16], r14\
+	mov [rsp+24], r15";
 
     return start;
 }
 
 char *end_func() {
     static char end[256] = "\
-mov r12, [rsp]\
-mov r13, [rsp+8]\
-mov r14, [rsp+16]\
-mov r15, [rsp+24]\
-add rsp, 32";
+	mov r12, [rsp]\
+	mov r13, [rsp+8]\
+	mov r14, [rsp+16]\
+	mov r15, [rsp+24]\
+	add rsp, 32";
 
     return end;
 }
@@ -65,16 +73,16 @@ char *init_int_literal(int val) {
 
     char *init;
     init = (char *)malloc(256 * sizeof(char));
-    sprintf(init, "mov [rsp+%d], %d", GEN_STATE.rsp_offset, val);
+    sprintf(init, "	mov [rsp+%d], %d", GEN_STATE.rsp_offset, val);
 
     return init;
 }
 
 int test_init_int_literal() {
-	testing_func_setup();
+    testing_func_setup();
     code_gen_init();
 
-    tassert(strcmp(init_int_literal(100), "mov [rsp+8], 100") == 0);
+    tassert(strcmp(init_int_literal(100), "	mov [rsp+8], 100") == 0);
 
-	return 0;
+    return 0;
 }
diff --git a/src/codegen/x86/codegen.h b/src/codegen/x86/codegen.h
@@ -8,6 +8,8 @@ char *end_main();
 
 char *start_func();
 
+char *end_main_custom_return(int val);
+
 char *end_func();
 
 char *init_int_literal(int val);

diff --git a/src/driver/main.c b/src/driver/main.c
@@ -6,13 +6,14 @@
 #include <string.h> // strcmp
 
 #include "../lexer/lex.h"
+#include "../parser/parse.h"
 #include "../util/out.h"
 
-int lexer_dump(const char* filename) {
+int lexer_dump(const char *filename) {
 
     // Initialization of everything
     Lexer lexer;
-    FILE * fp = fopen(filename, "r");
+    FILE *fp = fopen(filename, "r");
     if (!fp) {
         PRINT_ERROR("File %s not found", filename);
         return 1;
@@ -24,12 +25,15 @@ int lexer_dump(const char* filename) {
     Token t;
     do {
         // Return if some non-zero (error) code is returned
-        if (lex(&lexer, &t)) return 1;
-        printf("Contents: %20s, type: %20s, position: %d/%d\n", t.contents, ttype_name(t.type), t.line, t.column);
+        if (lex(&lexer, &t))
+            return 1;
+        printf("Contents: %20s, type: %20s, position: %d/%d\n", t.contents,
+               ttype_name(t.type), t.line, t.column);
     } while (t.type != TT_EOF);
 
-    return 0;
+	fclose(fp);
 
+    return 0;
 }
 
 int main(int argc, char **argv) {
@@ -38,14 +42,16 @@ int main(int argc, char **argv) {
 
     // Skip the name of the executable.
     --argc, ++argv;
-    
+
     if (argc == 0) {
         PRINT_DEFAULT("Usage: --token-dump <filename> to see all tokens");
         return 0;
     }
 
     if (argc == 1) {
-        PRINT_DEFAULT("default compilation not supported yet -- try 'jccc --token-dump %s' instead.", argv[0]);
+        PRINT_DEFAULT("default compilation not supported yet -- try 'jccc "
+                      "--token-dump %s' instead.",
+                      argv[0]);
         return 1;
     }
 
@@ -55,13 +61,14 @@ int main(int argc, char **argv) {
     }
 
     // Two arguments now.
-    if (strcmp(argv[0], "--token-dump")) {
-        PRINT_ERROR("option %s not recognized.", argv[1]);
-        return 1;
-    }
-
-    // Finally, we can do the lexer test properly!
-    return lexer_dump(argv[1]);
+    if (strcmp(argv[0], "--token-dump") == 0) {
+        // Finally, we can do the lexer test properly!
+        return lexer_dump(argv[1]);
+    } else if (strcmp(argv[0], "--test-parse") == 0) {
+		parse(argv[1]);
+		return 0;
+	}
 
-    return 0;
+	PRINT_ERROR("option %s not recognized.", argv[1]);
+	return 1;
 }
diff --git a/src/lexer/lex.c b/src/lexer/lex.c
@@ -248,11 +248,11 @@ int real_lex(Lexer *l, Token *t) {
     // return./
     if (starts_operator(init)) {
         while (valid_operator_sequence(t->contents)) {
-            t->contents[pos++] = (c = getc(l->fp));
+            t->contents[pos++] = (c = lexer_getchar(l));
         }
         // We've ended!
         // Can we reduce this code duplication from above in a smart way?
-        ungetc(c, l->fp);
+        lexer_ungetchar(l);
         t->contents[pos - 1] = '\0';
         t->type = ttype_from_string(t->contents);
         t->length = pos;

diff --git a/src/parser/cst.h b/src/parser/cst.h
@@ -0,0 +1,62 @@
+/**
+ * The structures for a concrete syntax tree.
+ * For now, the subset of C we are parsing is quite simple:
+ * - Parameterless functions.
+ * - Return statements, which accept integers or function calls.
+ */
+
+#pragma once
+
+#include <util/list.h>
+
+// A list of all node types.
+typedef enum {
+    NT_STMT,
+    NT_EXPR,
+    NT_BLOCK_STMT,
+    NT_RETURN_STMT,
+    NT_FUNCDECL,
+    NT_FUNCCALL,
+    NT_LITERAL,
+} NodeType;
+
+// A block statement is just a list of statements.
+typedef struct {
+    List* stmts; // A list of Statement structs.
+} BlockStatement;
+
+typedef struct {
+    // TODO -- add parameters when we get there
+    BlockStatement body;
+    char name[256]; // The actual name of the function.
+} FunctionDeclaration;
+
+// An entire program is just a list of top level declarations.
+// For now, such declarations are only functions.
+typedef struct {
+    union {
+        FunctionDeclaration fd;
+        // VariableDeclaration vd; when we get there
+    } u;
+    NodeType type;
+} TopLevelDeclaration;
+
+// Right now, a function call doesn't have any parameters so it's just the name
+// of the function being called.
+typedef struct {
+    char name[256];
+} FunctionCall;
+
+// An expression for now is an integer or a function call.
+typedef struct {
+    union {
+        FunctionCall fc;
+        char literal[256];
+    } u;
+    NodeType type;
+} Expression;
+
+// Finally, an entire source file is a list of top-level declarations.
+typedef struct {
+    List* decls; // list of TopLevelDeclaration
+} ConcreteFileTree;
diff --git a/src/parser/parse.c b/src/parser/parse.c
@@ -0,0 +1,126 @@
+/* Parser
+ *
+ */
+
+#include <stdlib.h> // calloc
+#include <string.h> // strcmp
+#include <ctype.h> // isdigit
+
+#include <codegen/x86/codegen.h>
+#include <lexer/lex.h>
+#include <lexer/token.h>
+#include <parser/cst.h>
+#include <util/out.h>
+
+int parse(const char *filename) {
+
+    Lexer lexer;
+
+    FILE *fp = fopen(filename, "r");
+
+    if (!fp) {
+        PRINT_ERROR("File %s not found", filename);
+        return 1;
+    }
+
+    lexer.fp = fp;
+    lexer.unlexed_count = 0;
+    lexer.column = lexer.line = 1;
+
+    Token t;
+
+    int i = 0;
+    int buffer_size = 16;
+    Token *tokens = calloc(buffer_size, sizeof(Token));
+
+    do {
+        if (lex(&lexer, &t)) {
+            return 1;
+        }
+
+        if (buffer_size <= i) {
+            buffer_size *= 2;
+            tokens = calloc(buffer_size, sizeof(Token));
+        }
+
+        tokens[i] = t;
+
+        printf("Contents: %20s, type: %20s, position: %d/%d\n", t.contents,
+               ttype_name(t.type), t.line, t.column);
+
+        i++;
+    } while (t.type != TT_EOF);
+
+    // Main function
+    if (tokens[0].type == TT_INT && tokens[1].type == TT_IDENTIFIER &&
+        (strcmp(tokens[1].contents, "main") == 0)) {
+
+        // Empty parameter list followed by the opening brace
+        if (tokens[2].type == TT_OPAREN && tokens[3].type == TT_CPAREN &&
+            tokens[4].type == TT_OBRACE) {
+
+            // Return value
+            if (tokens[5].type == TT_RETURN && tokens[6].type == TT_LITERAL &&
+                isdigit(tokens[6].contents[0]) && tokens[7].type == TT_SEMI) {
+
+                // Matching closing brace
+                if (tokens[8].type == TT_CBRACE) {
+                    printf("\n");
+
+                    // Generate preamble main code
+                    char *code_start = start_main();
+
+                    printf(code_start);
+
+                    // Add custom return code
+                    char *code_end =
+                        end_main_custom_return(atoi(tokens[6].contents));
+
+                    printf(code_end);
+
+                } else {
+                    PRINT_ERROR("Wrong closing brace.\n");
+                }
+            } else {
+                PRINT_ERROR("Return value is wrong.\n");
+            }
+        } else {
+            PRINT_ERROR("Wrong main function body.\n");
+        }
+    } else {
+        PRINT_ERROR("Not correct main function.\n");
+    }
+
+    fclose(fp);
+
+    return 0;
+}
+
+int parse_simple_main_func() {}
+
+/**
+ * Proper parsing code below -- producing a concrete syntax tree from a file.
+ * Each of these functions will probably reference the others numerous times.
+ */
+
+int parse_expr(Lexer* l, Expression* ex) {
+    // TODO (just a literal or a function call for now).
+}
+
+int parse_funccall(Lexer* l, Expression* ex) {
+    // TODO
+}
+
+int parse_blockstmt(Lexer* l, BlockStatement* bs) {
+    // TODO
+}
+
+int parse_funcdecl(Lexer* l, FunctionDeclaration* fd) {
+    // TODO
+}
+
+// Parse function -- takes a lexer and produces a concrete syntax tree, filling
+// the caller-supplied struct with the data.
+int make_cst(Lexer* l, ConcreteFileTree* tree) {
+    // TODO
+}
diff --git a/src/parser/parse.h b/src/parser/parse.h
@@ -0,0 +1,5 @@
+/* Parser
+ *
+ */
+
+int parse(const char *filename);
diff --git a/tests/simplemain.c b/tests/simplemain.c
@@ -0,0 +1,3 @@
+int main() {
+	return 5;
+}