jabacat · adamhutchings · Jun 26, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
-build/
+build/
+shell.nix
diff --git a/src/lexer/lex.c b/src/lexer/lex.c
@@ -1,4 +1,7 @@
 #include "lex.h"
+#include "token.h"
+#include <assert.h>
+#include <stdio.h>
 #include <testing/tassert.h> // tassert
 #include <testing/test_utils.h>
 
@@ -83,6 +86,30 @@ int is_valid_numeric_or_id_char(char c) {
     return isalnum(c) || (c == '_') || (c == '.');
 }
 
+/**
+ * Reads the next character from the lexer's file and updates the lexer's
+ * bookkeeping: position is incremented, the previous column is saved in
+ * last_column, and line/column are advanced (a newline bumps line and resets
+ * column to 0; anything else, including EOF, bumps column).
+ *
+ * The character is also stored in l->buffer[0] so that lexer_ungetchar can
+ * put it back.
+ *
+ * Returns the character as an int in the range of unsigned char, or EOF.
+ */
+int lexer_getchar(Lexer *l) {
+    l->position++;
+    l->last_column = l->column;
+    // Keep getc's result as an int. Narrowing it to char before testing or
+    // returning it makes EOF indistinguishable from byte 0xFF when char is
+    // signed, and makes EOF undetectable when char is unsigned.
+    int c = getc(l->fp);
+    l->buffer[0] = (char)c;
+    if (c == '\n') {
+        l->line++;
+        l->column = 0;
+    } else {
+        l->column++;
+    }
+    return c;
+}
+
+/**
+ * Undoes the most recent lexer_getchar: decrements position, restores column
+ * from last_column, steps line back if the character was a newline, and
+ * pushes the character stored in l->buffer[0] back onto the file stream.
+ *
+ * Only one character of history is kept, so this must not be called twice
+ * without a lexer_getchar in between.
+ *
+ * Always returns 1.
+ */
+int lexer_ungetchar(Lexer *l) {
+    // position is decremented below, so it has to be strictly positive here
+    // for it to stay non-negative.
+    assert(l->position > 0);
+    l->position--;
+    l->column = l->last_column;
+    if (l->buffer[0] == '\n') {
+        l->line--;
+    }
+    // If the last read hit end-of-file (or failed) there is no real character
+    // to push back; the next read will simply report EOF again.
+    if (!feof(l->fp) && !ferror(l->fp)) {
+        // ungetc takes the character as an unsigned char value; passing a
+        // negative signed char could be mistaken for EOF and silently dropped.
+        ungetc((unsigned char)l->buffer[0], l->fp);
+    }
+    return 1;
+}
+
 int real_lex(Lexer*, Token*);
 
 /**
@@ -120,17 +147,22 @@ int real_lex(Lexer *l, Token *t) {
 
     skip_to_token(l);
     // Get initial character
-    int init = getc(l->fp);
+    int init = lexer_getchar(l);
 
     // Clear memory and initialize
     memset(t->contents, 0, TOKEN_LENGTH);
 
+    // Set sourcefile
+    memcpy(t->source_file, &l->current_file, TOKEN_LENGTH);
+
     // First important check -- have we reached the end of the file?
     static char eof[] = "[end of file]";
     if (init == EOF) {
         strcpy(t->contents, eof);
         t->length = strlen(eof);
         t->type = TT_EOF;
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
@@ -147,6 +179,8 @@ int real_lex(Lexer *l, Token *t) {
         strcpy(t->contents, nline);
         t->length = strlen(nline);
         t->type = TT_NEWLINE;
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
@@ -170,16 +204,22 @@ int real_lex(Lexer *l, Token *t) {
     if (in_string(init, single_char_tokens)) {
         t->length = pos;
         t->type = ttype_one_char(init);
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
     // LEXING NUMERIC LITERAL OR IDENTIFIER
     // If it starts with an alphanumeric character or an underscore, search
     // until we hit something which isn't.
     int c;
+    int starting_line;
+    int starting_col;
     if (is_valid_numeric_or_id_char(init)) {
+        starting_line = l->line;
+        starting_col = l->column;
         for (;;) {
-            c = getc(l->fp);
+            c = lexer_getchar(l);
             // If not alphanumeric or underscore, skip to end
             if (!is_valid_numeric_or_id_char(c))
                 break;
@@ -194,10 +234,12 @@ int real_lex(Lexer *l, Token *t) {
             t->contents[pos++] = c;
         }
         // We've ended!
-        ungetc(c, l->fp);
+        lexer_ungetchar(l);
         t->contents[pos] = '\0';
         t->type = ttype_many_chars(t->contents);
         t->length = pos;
+        t->line = starting_line;
+        t->column = starting_col;
         return 0;
     }
 
@@ -219,6 +261,7 @@ int real_lex(Lexer *l, Token *t) {
 
     // TODO - parse character or string literal
 
+    PRINT_ERROR("lexer unable to identify token starting with: %c", init);
     return 0;
 }
 
@@ -239,18 +282,18 @@ int skip_to_token(Lexer *l) {
     int in_block = 0, pass = 0;
 
     // Read the first character
-    if ((cur = fgetc(l->fp)) != EOF) {
+    if ((cur = lexer_getchar(l)) != EOF) {
         prev = cur;
         if (!(cur == ' ' || cur == '\t' || cur == '/')) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token begins immediately
         }
     } else {
         return -1; // File done, no more tokens
     }
 
     // Read each character from the file until EOF
-    while ((cur = fgetc(l->fp)) != EOF) {
+    while ((cur = lexer_getchar(l)) != EOF) {
         if (cur == '/' && prev == '/' && in_block == 0) {
             in_block = 1; // Single line comment
         } else if (cur == '*' && prev == '/' && in_block == 0) {
@@ -261,12 +304,11 @@ int skip_to_token(Lexer *l) {
             in_block = 0; // Out of comment
         } else if (prev == '/' && !(cur == '*' || cur == '/') &&
                    in_block == 0) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token was a slash without a * or / following it
         }
-
         if (!(cur == ' ' || cur == '\t' || cur == '/') && in_block == 0) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token is next
         }
 

diff --git a/src/lexer/lex.h b/src/lexer/lex.h
@@ -11,6 +11,12 @@
 // the state of a lexer.
 typedef struct {
     FILE *fp; // The file we are reading from.
+    char current_file[TOKEN_LENGTH]; // The name of the source file we are reading from; copied into each token's source_file.
+    char buffer[1]; // Holds the last character read by lexer_getchar so that lexer_ungetchar can put it back.
+    long position; // Number of characters read so far: incremented by lexer_getchar, decremented by lexer_ungetchar.
+    int last_column; // Value of column before the most recent lexer_getchar; restored by lexer_ungetchar.
+    int column; // Number of characters read on the current line; reset to 0 when a newline is read.
+    int line; // Number of newlines read so far.
     Token unlexed[TOKEN_PUTBACKS];
     unsigned unlexed_count;
 } Lexer;
@@ -19,6 +25,13 @@ typedef struct {
 // with the next available token from the file.
 int lex(Lexer *l, Token *token);
 
+// Wrapper for getc. Takes a lexer pointer and returns the next character in the file it is reading from.
+// Updates the lexer's position, line, and column.
+int lexer_getchar(Lexer *l);
+
+// Wrapper for ungetc. Takes a lexer pointer and backtracks one character using the lexer buffer. Updates position, line, and column.
+int lexer_ungetchar(Lexer *l);
+
 // Put a token back to be lexed again in the future.
 int unlex(Lexer *l, Token *token);
 

diff --git a/src/lexer/token.h b/src/lexer/token.h
@@ -100,7 +100,10 @@ typedef enum {
 #define TOKEN_LENGTH 256
 
 typedef struct {
-    TokenType type;              // What type of token this is.
-    char contents[TOKEN_LENGTH]; // The actual contents of the token.
-    unsigned length;             // How long the token is.
+    TokenType type;                 // What type of token this is.
+    char contents[TOKEN_LENGTH];    // The actual contents of the token.
+    unsigned length;                // How long the token is.
+    char source_file[TOKEN_LENGTH]; // Name of the source file the token was read from.
+    int line;                       // Line in the file where the token was found.
+    int column;                     // Column within that line where the token was found.
 } Token;