diff --git a/.gitignore b/.gitignore
index d163863..9a9082b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-build/
\ No newline at end of file
+build/
+shell.nix
\ No newline at end of file
diff --git a/src/lexer/lex.c b/src/lexer/lex.c
index 62bd177..0fd7146 100644
--- a/src/lexer/lex.c
+++ b/src/lexer/lex.c
@@ -1,4 +1,7 @@
 #include "lex.h"
+#include "token.h"
+#include
+#include
 
 #include // tassert
 #include
@@ -83,6 +86,30 @@ int is_valid_numeric_or_id_char(char c) {
     return isalnum(c) || (c == '_') || (c == '.');
 }
 
+int lexer_getchar(Lexer *l) {
+    l->position++;
+    l->last_column = l->column;
+    l->buffer[0] = getc(l->fp);
+    if (l->buffer[0] == '\n') {
+        l->line++;
+        l->column = 0;
+    } else {
+        l->column++;
+    }
+    return l->buffer[0];
+}
+
+int lexer_ungetchar(Lexer *l) {
+    assert(l->position >= 0);
+    l->position--;
+    l->column = l->last_column;
+    if (l->buffer[0] == '\n') {
+        l->line--;
+    }
+    ungetc(l->buffer[0], l->fp);
+    return 1;
+}
+
 int real_lex(Lexer*, Token*);
 
 /**
@@ -120,17 +147,22 @@ int real_lex(Lexer *l, Token *t) {
     skip_to_token(l);
 
     // Get initial character
-    int init = getc(l->fp);
+    int init = lexer_getchar(l);
 
     // Clear memory and initialize
     memset(t->contents, 0, TOKEN_LENGTH);
 
+    // Set sourcefile
+    memcpy(t->source_file, &l->current_file, TOKEN_LENGTH);
+
     // First important check -- have we reached the end of the file?
     static char eof[] = "[end of file]";
     if (init == EOF) {
         strcpy(t->contents, eof);
         t->length = strlen(eof);
         t->type = TT_EOF;
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
@@ -147,6 +179,8 @@
         strcpy(t->contents, nline);
         t->length = strlen(nline);
         t->type = TT_NEWLINE;
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
@@ -170,6 +204,8 @@
     if (in_string(init, single_char_tokens)) {
         t->length = pos;
         t->type = ttype_one_char(init);
+        t->line = l->line;
+        t->column = l->column;
         return 0;
     }
 
@@ -177,9 +213,13 @@
     // If it starts with an alphanumeric character or an underscore, search
     // until we hit something which isn't.
     int c;
+    int starting_line;
+    int starting_col;
     if (is_valid_numeric_or_id_char(init)) {
+        starting_line = l->line;
+        starting_col = l->column;
         for (;;) {
-            c = getc(l->fp);
+            c = lexer_getchar(l);
             // If not alphanumeric or underscore, skip to end
             if (!is_valid_numeric_or_id_char(c))
                 break;
@@ -194,10 +234,12 @@
             t->contents[pos++] = c;
         }
         // We've ended!
-        ungetc(c, l->fp);
+        lexer_ungetchar(l);
         t->contents[pos] = '\0';
         t->type = ttype_many_chars(t->contents);
         t->length = pos;
+        t->line = starting_line;
+        t->column = starting_col;
         return 0;
     }
 
@@ -219,6 +261,7 @@
 
     // TODO - parse character or string literal
 
+    PRINT_ERROR("lexer unable to identify token starting with: %c", init);
     return 0;
 }
 
@@ -239,10 +282,10 @@
     int in_block = 0, pass = 0;
 
     // Read the first character
-    if ((cur = fgetc(l->fp)) != EOF) {
+    if ((cur = lexer_getchar(l)) != EOF) {
        prev = cur;
         if (!(cur == ' ' || cur == '\t' || cur == '/')) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token begins immediately
         }
     } else {
@@ -250,7 +293,7 @@
     }
 
     // Read each character from the file until EOF
-    while ((cur = fgetc(l->fp)) != EOF) {
+    while ((cur = lexer_getchar(l)) != EOF) {
         if (cur == '/' && prev == '/' && in_block == 0) {
             in_block = 1; // Single line comment
         } else if (cur == '*' && prev == '/' && in_block == 0) {
@@ -261,12 +304,11 @@
             in_block = 0; // Out of comment
         } else if (prev == '/' && !(cur == '*' || cur == '/') &&
                    in_block == 0) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token was a slash without a * or / following it
         }
-
         if (!(cur == ' ' || cur == '\t' || cur == '/') && in_block == 0) {
-            fseek(l->fp, -1, SEEK_CUR);
+            lexer_ungetchar(l);
             return 0; // Token is next
         }
 
diff --git a/src/lexer/lex.h b/src/lexer/lex.h
index 8ed0489..9f63a5b 100644
--- a/src/lexer/lex.h
+++ b/src/lexer/lex.h
@@ -11,6 +11,12 @@
 // the state of a lexer.
 typedef struct {
     FILE *fp; // The file we are reading from.
+    char current_file[TOKEN_LENGTH]; // The name of the source file we are reading from.
+    char buffer[1]; // A buffer so that chars can be "put back".
+    long position; // The position of the file pointer in the current file, in characters from the start.
+    int last_column; // The column before the most recent read, so one character can be put back.
+    int column; // The number of characters down whichever line it's on.
+    int line; // The number of lines it has passed so far.
     Token unlexed[TOKEN_PUTBACKS];
     unsigned unlexed_count;
 } Lexer;
@@ -19,6 +25,13 @@ typedef struct {
 // with the next available token from the file.
 int lex(Lexer *l, Token *token);
 
+// Wrapper for getc. Takes a lexer pointer and returns the next character in the file it's holding onto.
+// Updates position and stream.
+int lexer_getchar(Lexer *l);
+
+// Wrapper for ungetc. Takes a lexer pointer and back-tracks 1 character using the lexer buffer. Updates position.
+int lexer_ungetchar(Lexer *l);
+
 // Put a token back to be lexed again in the future.
 int unlex(Lexer *l, Token *token);
 
diff --git a/src/lexer/token.h b/src/lexer/token.h
index 4713561..4615375 100644
--- a/src/lexer/token.h
+++ b/src/lexer/token.h
@@ -100,7 +100,10 @@ typedef enum {
 #define TOKEN_LENGTH 256
 
 typedef struct {
-    TokenType type; // What type of token this is.
-    char contents[TOKEN_LENGTH]; // The actual contents of the token.
-    unsigned length; // How long the token is.
+    TokenType type;                 // What type of token this is.
+    char contents[TOKEN_LENGTH];    // The actual contents of the token.
+    unsigned length;                // How long the token is.
+    char source_file[TOKEN_LENGTH]; // The source file the token was in.
+    int line;                       // Which line in the file the token was found.
+    int column;                     // Where in that line the token was found.
 } Token;
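
Below is a minimal, illustrative sketch (not part of the patch) of how the new position tracking is meant to be driven. It assumes the headers as modified above, that the caller zero-initializes the Lexer by hand (the diff shows no init routine, so starting at line 0, column 0 is an assumption), a hypothetical input file name, and that lex() follows real_lex()'s convention of returning 0 on success.

/* Illustrative sketch only -- not part of the patch above. */
#include <stdio.h>
#include <string.h>
#include "lex.h"

int main(void) {
    Lexer l;
    memset(&l, 0, sizeof l);                 /* position/line/column start at 0 (assumption) */
    strncpy(l.current_file, "example.src", TOKEN_LENGTH - 1); /* hypothetical file name */
    l.fp = fopen(l.current_file, "r");
    if (!l.fp)
        return 1;

    int c = lexer_getchar(&l);  /* advances position; bumps column, or line on '\n' */
    printf("read '%c' at %s:%d:%d\n", c, l.current_file, l.line, l.column);
    lexer_ungetchar(&l);        /* pushes the char back and restores the previous column */

    Token t;
    if (lex(&l, &t) == 0)       /* tokens now carry source_file/line/column */
        printf("token '%s' from %s:%d:%d\n", t.contents, t.source_file, t.line, t.column);

    fclose(l.fp);
    return 0;
}

Note that buffer[1] holds only the most recently read character, so lexer_ungetchar() can back-track a single step at most; multi-token lookahead still goes through unlex() and the unlexed[] array.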