-
Notifications
You must be signed in to change notification settings - Fork 0
/
LDLmodel.jl
173 lines (156 loc) · 8.39 KB
/
LDLmodel.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
using CSV, DataFrames
using JudiLing
using JudiLingMeasures
include("utils.jl")
include("repository.jl")
include("output.jl")
# Mutable container holding the state of one LDL modelling session.
# Every field starts out as `missing` (see the inner constructor) and is filled in
# by the `*_be` functions in this file as the corresponding step is run.
mutable struct LDLmodel
# Semantic matrix; set by `set_semantic_vectors_be` (simulated via
# `JudiLing.make_S_matrix` or loaded from an uploaded file).
S::Union{Missing,Matrix,JudiLing.SparseMatrixCSC}
# JudiLing cue object; this file reads its `C`, `A`, `i2f`, `f2i` and `gold_ind`
# fields. It is not assigned anywhere in this file.
cue_obj::Union{Missing,JudiLing.Cue_Matrix_Struct}
# Result of `JudiLing.make_transform_matrix(C, S)`.
F::Union{Missing,Matrix,JudiLing.SparseMatrixCSC}
# Result of `JudiLing.make_transform_matrix(S, C)`.
G::Union{Missing,Matrix,JudiLing.SparseMatrixCSC}
# Computed as `C * F`.
Shat::Union{Missing,Matrix,JudiLing.SparseMatrixCSC}
# Computed as `S * G`.
Chat::Union{Missing,Matrix,JudiLing.SparseMatrixCSC}
# The three values returned by `JudiLingMeasures.learn_paths_rpi`, in order.
res_learn::Union{Missing,Array{Array{JudiLing.Result_Path_Info_Struct,1},1}}
gpi_learn::Union{Missing,Vector{JudiLing.Gold_Path_Info_Struct}}
rpi_learn::Union{Missing,Vector{JudiLing.Gold_Path_Info_Struct}}
# Output of `JudiLingMeasures.compute_all_measures`.
measures::Union{Missing,DataFrame}
# The word-form dataset the model is built on; set by `get_dataset_be`.
dataset::Union{Missing, DataFrame}
# Create an empty model: all eleven fields are `missing`.
function LDLmodel()
new(missing, missing, missing, missing, missing, missing, missing, missing, missing, missing, missing)
end
end
"""
    get_dataset_be(ldl_model, file, dataname)

Load a dataset into `ldl_model.dataset` and return the name it was loaded under.

If `file` is `missing`, the dataset is fetched with `repository_get(dataname)` and
`dataname` is returned unchanged. Otherwise the name is derived from the upload via
`filename(filespayload(file))`, the file is registered with
`temporary_repository_add`, read back with `temporary_repository_get`, and the
derived name is returned.
"""
function get_dataset_be(ldl_model, file, dataname)
    ldl_model.dataset = if ismissing(file)
        repository_get(dataname)
    else
        # An uploaded file overrides the requested name with its own file name.
        dataname = filename(filespayload(file))
        temporary_repository_add(file, dataname)
        temporary_repository_get(dataname)
    end
    return dataname
end
"""
    set_semantic_vectors_be(ldl_model, file, base_columns_selected,
                            inflectional_columns_selected, ncol,
                            current_semantic_vectors, wordform_col)

Fill `ldl_model.S`, either with simulated vectors or from an uploaded file.

Returns a 3-tuple `(name, too_big, excluded)`:

- `file` is `missing`: `S` is built with `JudiLing.make_S_matrix` from
  `ldl_model.dataset` and the selected columns; returns `("simulated", false, missing)`.
- otherwise: the upload is registered under its file name in the temporary
  repository and read back, `S` and the covered word forms are obtained from
  `get_vectors_duplicates`, and `check_semantic_vectors_be` trims
  `ldl_model.dataset` to those word forms. Returns the file name together with the
  flag and the excluded-row count reported by `check_semantic_vectors_be`.

The incoming value of `current_semantic_vectors` is not read; it is overwritten in
the file branch and ignored in the simulated branch.
"""
function set_semantic_vectors_be(ldl_model, file,
                                 base_columns_selected,
                                 inflectional_columns_selected,
                                 ncol,
                                 current_semantic_vectors,
                                 wordform_col)
    if ismissing(file)
        ldl_model.S = JudiLing.make_S_matrix(ldl_model.dataset,
                                             base_columns_selected,
                                             inflectional_columns_selected,
                                             ncol=ncol)
        return "simulated", false, missing
    end

    current_semantic_vectors = filename(filespayload(file))
    temporary_repository_add(file, current_semantic_vectors)
    semvecs = temporary_repository_get(current_semantic_vectors)

    # Operate on the model that was passed in. The previous code used an
    # out-of-scope name (`my_ldl_model`) here, which only worked if a global of
    # that name happened to alias the argument.
    # NOTE(review): `get_vectors_duplicates` is defined outside this file; it is
    # assumed to return (semantic matrix, word forms) — confirm.
    ldl_model.S, words = get_vectors_duplicates(semvecs,
                                                ldl_model.dataset[:, wordform_col])
    too_big, excluded, ldl_model.dataset = check_semantic_vectors_be(words, ldl_model,
                                                                     wordform_col)
    return current_semantic_vectors, too_big, excluded
end
"""
    check_semantic_vectors_be(words, ldl_model, wordform_col)

Compare `words` against the `wordform_col` column of `ldl_model.dataset`.

Returns `(changed, n_removed, dataset)`. When the column equals `words`, nothing is
modified and the result is `(false, 0, ldl_model.dataset)`. Otherwise the dataset is
replaced by the rows whose word form occurs in `words`, and the result is `true`,
the number of rows removed, and the filtered dataset.
"""
function check_semantic_vectors_be(words, ldl_model, wordform_col)
    # Nothing to trim when the dataset column already matches the word list.
    if all(words == ldl_model.dataset[:, wordform_col])
        return false, 0, ldl_model.dataset
    end

    rows_before = size(ldl_model.dataset, 1)
    ldl_model.dataset = filter(row -> row[wordform_col] in words, ldl_model.dataset)
    rows_removed = rows_before - size(ldl_model.dataset, 1)
    return true, rows_removed, ldl_model.dataset
end
"""
    acc_calculable(ldl_model)

Return `true` when both `ldl_model.cue_obj` and `ldl_model.S` are set (not
`missing`), and `false` otherwise.
"""
function acc_calculable(ldl_model)
    has_cues = !ismissing(ldl_model.cue_obj)
    has_semantics = !ismissing(ldl_model.S)
    return has_cues && has_semantics
end
"""
    calculate_comprehension_accuracy_be(ldl_model, target_column_selected)

Compute and store `F` and `Shat` on `ldl_model`, then return the accuracy reported by
`JudiLing.eval_SC(Shat, S, dataset, target_column_selected)`.

Returns `missing` without touching the model when `acc_calculable(ldl_model)` is
false.
"""
function calculate_comprehension_accuracy_be(ldl_model, target_column_selected)
    acc_calculable(ldl_model) || return missing

    cue_matrix = ldl_model.cue_obj.C
    ldl_model.F = JudiLing.make_transform_matrix(cue_matrix, ldl_model.S)
    ldl_model.Shat = cue_matrix * ldl_model.F
    return JudiLing.eval_SC(ldl_model.Shat, ldl_model.S, ldl_model.dataset,
                            target_column_selected)
end
"""
    calculate_production_accuracy_be(ldl_model, target_column_selected)

Compute and store `G` and `Chat` on `ldl_model`, then return the accuracy reported by
`JudiLing.eval_SC(Chat, C, dataset, target_column_selected)`.

Returns `missing` without touching the model when `acc_calculable(ldl_model)` is
false.
"""
function calculate_production_accuracy_be(ldl_model, target_column_selected)
    acc_calculable(ldl_model) || return missing

    cue_matrix = ldl_model.cue_obj.C
    ldl_model.G = JudiLing.make_transform_matrix(ldl_model.S, cue_matrix)
    ldl_model.Chat = ldl_model.S * ldl_model.G
    return JudiLing.eval_SC(ldl_model.Chat, cue_matrix, ldl_model.dataset,
                            target_column_selected)
end
"""
    calculate_algo_production_accuracy_be(ldl_model, target_column_selected,
                                          production_threshold, ngram)

Run `JudiLingMeasures.learn_paths_rpi` on the model's own dataset, store its three
results in `res_learn`, `gpi_learn` and `rpi_learn`, and return
`JudiLing.eval_acc(res_learn, cue_obj)`.

`G`/`Chat` are computed first if `Chat` is `missing`, and `F`/`Shat` if `F` is
`missing`. The same dataset is passed in both dataset positions of
`cal_max_timestep` and `learn_paths_rpi`. `production_threshold` is forwarded as
`threshold` and `ngram` as `grams`.

This function does not check that `cue_obj` and `S` are set.
"""
function calculate_algo_production_accuracy_be(ldl_model,
                                               target_column_selected,
                                               production_threshold,
                                               ngram)
    cues = ldl_model.cue_obj

    # Fill in whichever mappings have not been computed yet.
    if ismissing(ldl_model.Chat)
        ldl_model.G = JudiLing.make_transform_matrix(ldl_model.S, cues.C)
        ldl_model.Chat = ldl_model.S * ldl_model.G
    end
    if ismissing(ldl_model.F)
        ldl_model.F = JudiLing.make_transform_matrix(cues.C, ldl_model.S)
        ldl_model.Shat = cues.C * ldl_model.F
    end

    data = ldl_model.dataset
    longest_path = JudiLing.cal_max_timestep(data, data, target_column_selected)

    paths, gold_info, result_info = JudiLingMeasures.learn_paths_rpi(
        data,
        data,
        cues.C,
        ldl_model.S,
        ldl_model.F,
        ldl_model.Chat,
        cues.A,
        cues.i2f,
        cues.f2i; # api changed in 0.3.1
        gold_ind = cues.gold_ind,
        Shat_val = ldl_model.Shat,
        check_gold_path = true,
        max_t = longest_path,
        max_can = 10,
        grams = ngram,
        threshold = production_threshold,
        tokenized = false,
        sep_token = "_",
        keep_sep = false,
        target_col = target_column_selected,
        issparse = :dense,
        verbose = true,
    )
    ldl_model.res_learn = paths
    ldl_model.gpi_learn = gold_info
    ldl_model.rpi_learn = result_info

    return JudiLing.eval_acc(ldl_model.res_learn, cues)
end
"""
    calculate_measures_be(ldl_model; target_column_selected=missing,
                          production_threshold=missing, ngram=missing)

Compute all JudiLingMeasures for the model, store them in `ldl_model.measures`, and
write them to `public/measures.csv`. Returns the value of `CSV.write`.

The model's own dataset and cue object are passed as both the training data and the
data of interest.

# Keywords
Only used when `ldl_model.res_learn` is still `missing`. In that case the path
search is run first via `calculate_algo_production_accuracy_be(ldl_model,
target_column_selected, production_threshold, ngram)`, so all three must be given.

# Throws
- `ArgumentError` if `res_learn` is `missing` and any of the three keywords was not
  supplied.
"""
function calculate_measures_be(ldl_model;
                               target_column_selected=missing,
                               production_threshold=missing,
                               ngram=missing)
    if ismissing(ldl_model.res_learn)
        # The previous code called `calculate_algo_production_accuracy()` here: a
        # zero-argument name that is not defined in this file and could not have
        # populated `ldl_model`. Run the backend path search on this model instead.
        if ismissing(target_column_selected) || ismissing(production_threshold) ||
           ismissing(ngram)
            throw(ArgumentError(
                "res_learn has not been computed; run " *
                "calculate_algo_production_accuracy_be first or pass " *
                "target_column_selected, production_threshold and ngram"))
        end
        calculate_algo_production_accuracy_be(ldl_model,
                                              target_column_selected,
                                              production_threshold,
                                              ngram)
    end

    ldl_model.measures = JudiLingMeasures.compute_all_measures(
        ldl_model.dataset,   # the data of interest
        ldl_model.cue_obj,   # the cue_obj of the training data
        ldl_model.cue_obj,   # the cue_obj of the data of interest
        ldl_model.Chat,      # the Chat of the data of interest
        ldl_model.S,         # the S matrix of the data of interest
        ldl_model.Shat,      # the Shat matrix of the data of interest
        ldl_model.F,
        ldl_model.G,
        ldl_model.res_learn, # the output of learn_paths for the data of interest
        ldl_model.gpi_learn, # the gpi_learn object of the data of interest
        ldl_model.rpi_learn) # the rpi_learn object of the data of interest

    return CSV.write("public/measures.csv", ldl_model.measures)
end
"""
    get_number_of_cues(ldl_model)

Return a message stating how many columns the cue matrix `ldl_model.cue_obj.C` has,
or the empty string when `ldl_model.cue_obj` is `missing`.
"""
function get_number_of_cues(ldl_model)
    ismissing(ldl_model.cue_obj) && return ""

    n_cues = size(ldl_model.cue_obj.C, 2)
    return string("There are ", n_cues, " cues in your C matrix.")
end