-
Notifications
You must be signed in to change notification settings - Fork 3
/
lexical_analyzer_driver.cpp
160 lines (143 loc) · 8.09 KB
/
lexical_analyzer_driver.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#include "lexical_analyzer_driver.h"
#include <algorithm>
bool LexicalAnalyzerDriver::IsInputOver() {
return this->input_over_;
}
LexicalAnalyzerDriver::LexicalAnalyzerDriver(DFAState *root_state_, const string &input_file_name_, const vector<RegularExpression> ®ular_expressions) :
root_state_(root_state_),
input_over_(false),
forward_iterator_fills_buffer_(true),
input_file_name_(input_file_name_),
buffers_((unsigned long) this->number_of_buffers_, vector<char>(this->buffer_size_ + 1, EOF)) {
input_file_.open(input_file_name_, ifstream::binary);
this->forward_ = this->buffers_[0].begin();
this->input_file_.read(this->buffers_[0].data(), this->buffer_size_);
this->active_buffer_ = 0;
//We get the read number of bytes. We don't need to set the last byte with 0 here as it's all initialized with zeros but below we should as data circulates
this->active_buffer_size_ = ((this->input_file_) ? this->buffer_size_ : this->input_file_.gcount());
//This char is dummy as it's skipped when returning the lexeme as the root state has no character transitioner
this->characters_states_.push(make_pair(this->dummy_initial_transition_char_, this->root_state_));
SetRegularExpressionsNames(regular_expressions);
}
//The caller has to check if no more input is present by calling IsInputOver method
//The caller is the parser.
//TAKE CARE this function assumes that a lexeme is strictly smaller than the size of a single buffer so make sure the buffer size is large enough
Token *LexicalAnalyzerDriver::GetNextToken() {
if(this->IsSkippableCharacter(*this->forward_)) this->IncreaseForwardPointer(true);
//start with setting lexeme_begin with forward in order to satrt a new lexeme
this->lexeme_begin_ = this->forward_;
while (true) {
// //Skippable characters are handled in the forward pointer transition functions but this check is for the first ever character
// if (this->IsSkippableCharacter(*this->forward_)) {
// this->IncreaseForwardPointer();
// }
//if dead state then pop until finding an accepting state else return error
//if the char now is 0 then the buffer isn't fully filled and that means the input is finished not at the end of the buffer
//forward is pointing to the next character
if (characters_states_.top().second->IsDeadState() || *this->forward_ == EOF || this->IsSkippableCharacter(*this->forward_)) {
for (int i = 0; i < this->characters_states_.size(); ++i) {
if (this->characters_states_.top().second->IsAcceptingState()) {
Token *token = this->GetTokenFromStatesStack();
//When encountering an acceptance state forward the forward pointer to skip any skippable character to check if the EOF is reached
if(this->IsSkippableCharacter(*this->forward_)) this->IncreaseForwardPointer(true);
if(*this->forward_ == EOF) this->input_over_ = true;
//Push the s tate after the root state onto the stack in order to be used in the next call of GetNextToken
this->characters_states_.push(make_pair(this->dummy_initial_transition_char_, this->root_state_));
return token;
}
this->characters_states_.pop();
//After calling decrease pointer if there's an error, then there's no accepting state as we reached lexeme_begin without finding one
if (!this->DecreaseForwardPointer()) return nullptr;
}
}
DFAState* neighbour= this->characters_states_.top().second->GetNeighbour(string{*this->forward_});
//return null if the current input is not available in the possible inputs specified in the regular expressions file
//take care that if the input is present in the input table then it would have to lead to a transition either to a normal state
//or to a dead state
if(neighbour == nullptr) return nullptr;
//Push the neighbour onto the stack transitioned wth the current character forward points to
this->characters_states_.push(make_pair(*this->forward_, neighbour));
this->IncreaseForwardPointer();
}
}
Token *LexicalAnalyzerDriver::GetTokenFromStatesStack() {
//Here the strings are created on the stack as they're copied into the constructor of token
string lexeme;
string token_name = dynamic_cast<DFAAcceptanceState*>(this->characters_states_.top().second)->get_token_name();
int charcaters_states_size_ = static_cast<int>(this->characters_states_.size());
for (int i = 0; i < charcaters_states_size_; ++i) {
if (this->characters_states_.top().second == this->root_state_) {
this->characters_states_.pop();
break;
}
//Append the character that transited the current DFAState
lexeme += this->characters_states_.top().first;
this->characters_states_.pop();
}
reverse(lexeme.begin(), lexeme.end());
if(token_name[0] == '\\') {
token_name.erase(0, 1);
}
return new Token(lexeme, token_name);
}
void LexicalAnalyzerDriver::IncreaseForwardPointer(bool skip_skippable_chars_recursively) {
//Reading the buffers. Fill next buffer if reached the end of the current buffer
//Take care that end points to the element one past the last element and the last element is the \0
if (this->forward_ == this->buffers_[this->active_buffer_].end() - 2) {
this->active_buffer_ = (this->active_buffer_ + 1) % this->number_of_buffers_;
if (this->forward_iterator_fills_buffer_) {
this->input_file_.read(this->buffers_[this->active_buffer_].data(), this->buffer_size_);
} else {
//If the next buffer is recently filled and we jumped backwards (using modulus) then don't fill it again to avoid overwriting and then set the
//flag to true for the next forward jump to go normally
this->forward_iterator_fills_buffer_ = true;
}
this->forward_ = this->buffers_[this->active_buffer_].begin();
this->input_over_ = !this->input_file_.good();
//If the input is over the last element read after it will have a EOF even if it's the begining of a buffer as gcount() returns 0 if noting is read
//this will make the forward iterator be pointing at a EOF
this->active_buffer_size_ = ((this->input_file_) ? this->buffer_size_ : this->input_file_.gcount());
//Set the last character in the buffer to EOF in order to indicate the end of the input
//Take care that this is needed asa this is a circular buffer.
//Each buffer is filled with initialized with buffer size + 1 that's why we add the EOF index active buffer size
this->buffers_[this->active_buffer_][this->active_buffer_size_] = static_cast<char>(EOF);
} else {
this->forward_++;
}
//recursively forward pointer untill reaching a non skippable character
if (skip_skippable_chars_recursively) {
while (IsSkippableCharacter(*this->forward_)) {
this->IncreaseForwardPointer();
}
}
}
bool LexicalAnalyzerDriver::DecreaseForwardPointer() {
if (this->forward_ == this->lexeme_begin_) return false;
//Jump to the previous buffer if at the beginning of the current buffer the mod handles the circulation in the buffer
if (this->forward_ == this->buffers_[this->active_buffer_].begin()) {
this->active_buffer_ = (this->active_buffer_ - 1) % this->number_of_buffers_;
//Take care that end points to the element one past the last element and the last element is the EOF
this->forward_ = this->buffers_[this->active_buffer_].end() - 2;
this->forward_iterator_fills_buffer_ = false;
} else this->forward_--;
if (IsSkippableCharacter(*this->forward_)) return false;
return true;
}
LexicalAnalyzerDriver::~LexicalAnalyzerDriver() {
this->input_file_.close();
}
bool LexicalAnalyzerDriver::IsSkippableCharacter(char c) const {
return c == '\n' || c == ' ' || c == '\t';
}
set<string> LexicalAnalyzerDriver::GetRegularExpressionsNames() {
return regular_expressions_names_;
}
void LexicalAnalyzerDriver::SetRegularExpressionsNames(const vector<RegularExpression> ®ular_expressions) {
for (auto&& regular_expression : regular_expressions){
string regular_expression_name = regular_expression.getName();
if(regular_expression_name[0] == '\\'){
regular_expression_name.erase(0,1);
}
regular_expressions_names_.insert(regular_expression_name);
}
}