-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.cpp
102 lines (96 loc) · 3.78 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#include <cstring>
#include <iostream>
#include <stdexcept>
#include <string>
#include "util.h"
#include "grams.h"
using namespace std;
/**
 * @brief Train the model and save the normalized co-occurrence matrix.
 *
 * Runs the training pipeline on state declared outside this function
 * (`words`, `freq`, `word_index`, `coocur_matrix`, `normalized_matrix`),
 * printing a progress line to stdout after each step up to the construction
 * of the co-occurrence matrix, and finally writes the result to `matrix.txt`.
 *
 * @param n hyperparameter `n`; `n - 1` `<unk>` tokens are added to each end
 *          of the word list before counting
 * @param filename file name of the dataset file
 */
void train(const int& n, const string& filename) {
    // Step 1: read every word of the dataset.
    record_words(filename, words);
    cout << "done with record_words" << endl;

    // Step 2: pad both ends of the word list with `<unk>`, n - 1 times.
    int remaining_pads = n - 1;
    while (remaining_pads > 0) {
        words.insert(words.begin(), "<unk>");
        words.push_back("<unk>");
        --remaining_pads;
    }

    // Step 3: count how often each word occurs.
    record_freq(words, freq);
    cout << "done with record_freq" << endl;

    // Step 4: replace the less frequent words with `<unk>`.
    filter_words(freq, words);
    cout << "done with filter_words" << endl;

    // Step 5: assign every word its index in the matrix.
    set_word_index(words, word_index);
    cout << "done with set_word_index" << endl;

    // Step 6: build the co-occurrence matrix.
    set_coocur_matrix(words, word_index, coocur_matrix, n);
    cout << "done with set_coocur_matrix" << endl;

    // Step 7: normalize the matrix and persist it so it can be reused later
    // without retraining.
    normalize_matrix(coocur_matrix, normalized_matrix);
    save_matrix("matrix.txt", normalized_matrix, word_index);
}
/**
 * @brief Look up the most similar words for every word listed in a test file.
 *
 * Restores the normalized matrix from `matrix_file`, reads `testfile` line by
 * line (each line is treated as one query word) and writes one line per query
 * to `output.txt`: every item returned by `most_similar`, each followed by a
 * single space. The idea is that words sharing similar contexts are similar.
 *
 * Prints an error to stderr and exits with status 1 if the test file or
 * `output.txt` cannot be opened.
 *
 * @param testfile path of the file holding the query words, one per line
 * @param matrix_file path of the matrix file to restore
 */
void test(const string& testfile, const string& matrix_file) {
    restore_matrix(matrix_file, normalized_matrix, index_word);

    ifstream file(testfile); // open the test file
    if (!file.good()) {
        cerr << "Error: cannot open file " << testfile << endl;
        exit(1);
    }

    // Read all query words up front; each row of the test file is a word.
    vector<string> test_words;
    for (string word; getline(file, word);) {
        test_words.emplace_back(word);
    }
    file.close();

    ofstream outfile("output.txt");
    if (!outfile.good()) {
        cerr << "Error: cannot open file output.txt" << endl;
        exit(1);
    }

    // One output line per query word. How many neighbours are listed is
    // decided by most_similar (the original notes mention five).
    // Elements are bound by const reference to avoid a copy per iteration.
    for (const auto& w : test_words) {
        auto vec = most_similar(w, normalized_matrix, index_word);
        for (const auto& item : vec) {
            outfile << item << " ";
        }
        // '\n' instead of endl: no need to flush after every query, the
        // stream is flushed when it is closed below.
        outfile << '\n';
    }
    outfile.close();
}
/**
 * @brief Command-line entry point.
 *
 * Supported invocations:
 *   ./n_gram --train <n> <input file>
 *   ./n_gram --test <test file> <matrix file>
 *
 * @param argc number of command-line arguments
 * @param argv command-line arguments
 * @return 0 on success, 1 if the arguments are missing or invalid
 */
int main(int argc, const char** argv) {
    // Guard against a missing mode argument: argv[1] is a null pointer when
    // the program is started without arguments.
    const string mode = argc > 1 ? argv[1] : "";
    const bool is_train = mode == "--train";
    const bool is_test = mode == "--test";

    if (!is_train && !is_test) {
        cerr << "Invalid input" << endl;
        cerr << "Usage: ./n_gram --train <n> <input file>" << endl;
        cerr << " ./n_gram --test <test file> <matrix file>" << endl;
        return 1;
    }

    if (is_train) {
        if (argc != 4) {
            cerr << "Invalid input" << endl;
            cerr << "Usage: ./n_gram --train <n> <input file>" << endl;
            return 1;
        }

        // Parse the hyperparameter `n` strictly: the whole argument must be
        // a number, and it must be positive.
        const string n_arg = argv[2];
        int n = 0;
        size_t parsed_chars = 0;
        try {
            n = stoi(n_arg, &parsed_chars);
        } catch (const logic_error&) {
            // stoi throws invalid_argument or out_of_range, both of which
            // derive from logic_error; treat either as "nothing parsed".
            parsed_chars = 0;
        }
        if (parsed_chars == 0 || parsed_chars != n_arg.size() || n < 1) {
            cerr << "Error: <n> must be a positive integer, got '" << n_arg << "'" << endl;
            cerr << "Usage: ./n_gram --train <n> <input file>" << endl;
            return 1;
        }

        const string filename = argv[3]; // the input file name
        train(n, filename);
    }
    else {
        if (argc != 4) {
            cerr << "Invalid input" << endl;
            cerr << "Usage: ./n_gram --test <test file> <matrix file>" << endl;
            return 1;
        }
        const string test_file = argv[2];
        const string matrix_file = argv[3];
        test(test_file, matrix_file);
    }
    return 0;
}