Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add token tracking #34

Merged
merged 8 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
build/
build/
shell.nix
60 changes: 51 additions & 9 deletions src/lexer/lex.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include "lex.h"
#include "token.h"
#include <assert.h>
#include <stdio.h>
#include <testing/tassert.h> // tassert
#include <testing/test_utils.h>

Expand Down Expand Up @@ -83,6 +86,30 @@ int is_valid_numeric_or_id_char(char c) {
return isalnum(c) || (c == '_') || (c == '.');
}

/**
 * Read the next character from the lexer's file and update its bookkeeping.
 *
 * Increments l->position, saves the current column in l->last_column so a
 * single lexer_ungetchar() can restore it, and then advances l->line /
 * l->column: a newline bumps the line and resets the column to 0, anything
 * else (including EOF) bumps the column.
 *
 * @param l  Lexer whose fp is read from.
 * @return   The value returned by getc(): the character as an unsigned char
 *           converted to int, or EOF on end of file / read error.
 */
int lexer_getchar(Lexer *l) {
    // Keep the result as an int. Narrowing it to char before testing or
    // returning it makes EOF indistinguishable from a 0xFF byte when char is
    // signed, and makes EOF undetectable when char is unsigned.
    int c = getc(l->fp);

    l->position++;
    l->last_column = l->column;

    // Remember the character so lexer_ungetchar() can push it back.
    l->buffer[0] = (char)c;

    if (c == '\n') {
        l->line++;
        l->column = 0;
    } else {
        l->column++;
    }
    return c;
}

/**
 * Push the most recently read character back onto the lexer's file and
 * rewind the bookkeeping done by lexer_getchar().
 *
 * Decrements l->position, restores l->column from l->last_column, and
 * decrements l->line when the character being pushed back is a newline.
 * Only one column value is saved, so only the latest read can be undone
 * accurately.
 *
 * @param l  Lexer whose buffered character is handed to ungetc().
 * @return   Always 1; the result of ungetc() is not inspected.
 */
int lexer_ungetchar(Lexer *l) {
    assert(l->position >= 0);

    const char last_read = l->buffer[0];

    // Rewind the counters that lexer_getchar() advanced.
    l->position -= 1;
    l->column = l->last_column;
    if (last_read == '\n')
        l->line -= 1;

    ungetc(last_read, l->fp);
    return 1;
}

int real_lex(Lexer*, Token*);

/**
Expand Down Expand Up @@ -120,17 +147,22 @@ int real_lex(Lexer *l, Token *t) {

skip_to_token(l);
// Get initial character
int init = getc(l->fp);
int init = lexer_getchar(l);

// Clear memory and initialize
memset(t->contents, 0, TOKEN_LENGTH);

// Set sourcefile
memcpy(t->source_file, &l->current_file, TOKEN_LENGTH);

// First important check -- have we reached the end of the file?
static char eof[] = "[end of file]";
if (init == EOF) {
strcpy(t->contents, eof);
t->length = strlen(eof);
t->type = TT_EOF;
t->line = l->line;
t->column = l->column;
return 0;
}

Expand All @@ -147,6 +179,8 @@ int real_lex(Lexer *l, Token *t) {
strcpy(t->contents, nline);
t->length = strlen(nline);
t->type = TT_NEWLINE;
t->line = l->line;
t->column = l->column;
return 0;
}

Expand All @@ -170,16 +204,22 @@ int real_lex(Lexer *l, Token *t) {
if (in_string(init, single_char_tokens)) {
t->length = pos;
t->type = ttype_one_char(init);
t->line = l->line;
t->column = l->column;
return 0;
}

// LEXING NUMERIC LITERAL OR IDENTIFIER
// If it starts with an alphanumeric character or an underscore, search
// until we hit something which isn't.
int c;
int starting_line;
int starting_col;
if (is_valid_numeric_or_id_char(init)) {
starting_line = l->line;
starting_col = l->column;
for (;;) {
c = getc(l->fp);
c = lexer_getchar(l);
// If not alphanumeric or underscore, skip to end
if (!is_valid_numeric_or_id_char(c))
break;
Expand All @@ -194,10 +234,12 @@ int real_lex(Lexer *l, Token *t) {
t->contents[pos++] = c;
}
// We've ended!
ungetc(c, l->fp);
lexer_ungetchar(l);
t->contents[pos] = '\0';
t->type = ttype_many_chars(t->contents);
t->length = pos;
t->line = starting_line;
t->column = starting_col;
return 0;
}

Expand All @@ -219,6 +261,7 @@ int real_lex(Lexer *l, Token *t) {

// TODO - parse character or string literal

PRINT_ERROR("lexer unable to identify token starting with: %c", init);
return 0;
}

Expand All @@ -239,18 +282,18 @@ int skip_to_token(Lexer *l) {
int in_block = 0, pass = 0;

// Read the first character
if ((cur = fgetc(l->fp)) != EOF) {
if ((cur = lexer_getchar(l)) != EOF) {
prev = cur;
if (!(cur == ' ' || cur == '\t' || cur == '/')) {
fseek(l->fp, -1, SEEK_CUR);
lexer_ungetchar(l);
return 0; // Token begins immediately
}
} else {
return -1; // File done, no more tokens
}

// Read each character from the file until EOF
while ((cur = fgetc(l->fp)) != EOF) {
while ((cur = lexer_getchar(l)) != EOF) {
if (cur == '/' && prev == '/' && in_block == 0) {
in_block = 1; // Single line comment
} else if (cur == '*' && prev == '/' && in_block == 0) {
Expand All @@ -261,12 +304,11 @@ int skip_to_token(Lexer *l) {
in_block = 0; // Out of comment
} else if (prev == '/' && !(cur == '*' || cur == '/') &&
in_block == 0) {
fseek(l->fp, -1, SEEK_CUR);
lexer_ungetchar(l);
return 0; // Token was a slash without a * or / following it
}

if (!(cur == ' ' || cur == '\t' || cur == '/') && in_block == 0) {
fseek(l->fp, -1, SEEK_CUR);
lexer_ungetchar(l);
return 0; // Token is next
}

Expand Down
13 changes: 13 additions & 0 deletions src/lexer/lex.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@
// the state of a lexer.
typedef struct {
// The file we are reading from; lexer_getchar() reads it with getc() and
// lexer_ungetchar() pushes characters back with ungetc().
FILE *fp; // The file we are reading from.
// Name of the source file being read. real_lex() copies TOKEN_LENGTH bytes
// of it into each token's source_file.
char current_file[TOKEN_LENGTH]; // The name of source file we are reading from.
// Holds the character most recently read by lexer_getchar() so that
// lexer_ungetchar() can put it back.
char buffer[1]; // A buffer so that chars can be "put back"
// Character position in the current file: incremented by every
// lexer_getchar() and decremented by every lexer_ungetchar().
long position; // The position of the file pointer in the current file, in characters from the start
// Value of `column` saved just before the latest lexer_getchar();
// lexer_ungetchar() restores `column` from it.
int last_column;
// Reset to 0 when a newline is read, otherwise incremented per character.
int column; // The number of characters along whichever line it's on
// Incremented when a newline is read, decremented when one is pushed back.
int line; // The number of lines it has passed so far
// NOTE(review): presumably the tokens handed back through unlex(), with
// unlexed_count tracking how many are stored -- confirm against unlex().
Token unlexed[TOKEN_PUTBACKS];
unsigned unlexed_count;
} Lexer;
Expand All @@ -19,6 +25,13 @@ typedef struct {
// with the next available token from the file.
int lex(Lexer *l, Token *token);

// Wrapper for getc. Takes a lexer pointer and returns the next character in the file it's holding onto.
// Updates the lexer's position, line, and column.
int lexer_getchar(Lexer *l);

// Wrapper for ungetc. Takes a lexer pointer and backtracks one character using the lexer buffer. Restores the lexer's position, line, and column.
int lexer_ungetchar(Lexer *l);

// Put a token back to be lexed again in the future.
int unlex(Lexer *l, Token *token);

Expand Down
9 changes: 6 additions & 3 deletions src/lexer/token.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ typedef enum {
#define TOKEN_LENGTH 256

typedef struct {
TokenType type; // What type of token this is.
char contents[TOKEN_LENGTH]; // The actual contents of the token.
unsigned length; // How long the token is.
TokenType type; // What type of token this is.
char contents[TOKEN_LENGTH]; // The actual contents of the token.
unsigned length; // How long the token is.
char source_file[TOKEN_LENGTH]; // The source file the token was in.
int line; // which line in the file the token was found
int column; // Where in that line the token was found
} Token;
Loading