"""
Divide a corpus into training and test sets so that the atom and compound divergences
are set to desired values. Before this script, the data has been converted to atom and
compound frequency matrices.
"""
from os import path, makedirs
import argparse
import random
import sys
import pickle as pkl
from tqdm import tqdm
import torch
TRAIN_SET = 0
TEST_SET = 1
DISCARD_SET = 2
SET_NAMES = {TRAIN_SET: 'TRAIN_SET', TEST_SET: 'TEST_SET', DISCARD_SET: 'DISCARD_SET'}
MASK_VALUE = -torch.inf
if torch.cuda.is_available():
    print("Using", torch.cuda.device_count(), "GPU(s).")
    if torch.cuda.device_count() == 1:
        device = torch.device("cuda")
    elif torch.cuda.device_count() == 2:
        device = torch.device("cuda:1")
    else:
        raise ValueError("Using more than 2 GPUs is not supported.")
else:
    print("Using only CPU.")
    device = torch.device("cpu")
# the full frequency matrices are loaded on the CPU first and moved to `device` on demand
secondary_device = torch.device("cpu")
def chernoff_coef(matrix1, matrix2, alpha):
    """
    The Chernoff coefficient is a similarity measure between two probability
    distributions P and Q:
        C_alpha(P||Q) = sum_k p_k^alpha * q_k^(1-alpha), with values in [0, 1].
    The alpha parameter determines how strongly we penalise Q for including
    elements that are not in P.
    (Atom divergence is 1 - Chernoff coefficient, with alpha=0.5.)
    (Compound divergence is 1 - Chernoff coefficient, with alpha=0.1.)

    Returns
    -------
    torch.Tensor
        Chernoff coefficient between matrix2 and each row of matrix1:
        a vector of length matrix1.shape[0] if matrix1 is 2D, else a scalar.
    """
    if len(matrix1.shape) == 1:
        sum_axis = 0
    elif len(matrix1.shape) == 2:
        sum_axis = 1
        if len(matrix2.shape) == 1 and matrix2.shape[0] != matrix1.shape[1]:
            raise ValueError("matrix second dim must be equal to vector length")
    else:
        raise ValueError("matrix must be 1D or 2D")
    if alpha < 0 or alpha > 1:
        raise ValueError("alpha must be in [0,1]")
    # computed in log space; zero entries give exp(-inf) = 0, as intended
    return torch.sum(torch.exp((torch.log(matrix1) * alpha) +
                               (torch.log(matrix2) * (1 - alpha))), axis=sum_axis)
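# A minimal worked example with illustrative values (not from any corpus):
#   >>> p = torch.tensor([0.5, 0.5])
#   >>> q = torch.tensor([1.0, 0.0])
#   >>> chernoff_coef(p, q, 0.5)   # sqrt(0.5 * 1.0) + sqrt(0.5 * 0.0)
#   tensor(0.7071)
# so the atom divergence between these two distributions would be 1 - 0.7071 ≈ 0.29.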
def mat_vec_divergence(matrix, vector, alpha):
"""Divergence between a vector and each row of a matrix."""
return 1 - chernoff_coef(normalize_matrix(matrix), normalize_vector(vector), alpha)
def mat_mat_divergence(matrix1, matrix2, alpha):
"""Divergence between each row of matrix1 and the corresponding row of matrix2."""
return 1 - chernoff_coef(normalize_matrix(matrix1), normalize_matrix(matrix2), alpha)
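# Sanity check (illustrative): a distribution has zero divergence from itself:
#   >>> m = torch.tensor([[0.5, 0.5]])
#   >>> mat_mat_divergence(m, m, 0.5)
#   tensor([0.])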
def get_scores(atomdivs, comdivs, target_atom_div, target_com_div):
    """Score each index of the atomdivs and comdivs vectors: the negated sum of
    absolute differences from the target divergences, so higher is better and
    a perfect match scores 0."""
return - torch.abs(comdivs - target_com_div) \
- torch.abs(atomdivs - target_atom_div)
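# For instance (illustrative numbers): with targets atom_div=0.0 and com_div=1.0,
# a candidate that would yield atom_div=0.05 and com_div=0.8 scores
# -|0.8 - 1.0| - |0.05 - 0.0| = -0.25.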
def remove_row(matrix, row):
"""Remove a row from a matrix (2D tensor) or vector (1D tensor)."""
return matrix[torch.arange(matrix.shape[0], device=device) != row]
def delete_value(vector, value):
"""Delete element from tensor by value. If there are multiple values,
the first one is deleted. (Vectors should not have duplicates, though.)"""
idx = (vector == value).nonzero(as_tuple=True)[0][0]
return remove_row(vector, idx)
def normalize_vector(vector):
"""Normalize a vector to have sum 1."""
return torch.nan_to_num(torch.divide(vector, torch.sum(vector)))
def normalize_matrix(matrix):
    """Normalize a matrix so that each row sums to 1. All-zero rows would produce
    NaNs from the division; nan_to_num maps them back to all-zero rows."""
return torch.nan_to_num(
torch.transpose(
torch.divide(
torch.transpose(matrix, 0, 1),
torch.sum(matrix, axis=1)
),
0, 1) # /transpose
)
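# Example (illustrative):
#   >>> normalize_matrix(torch.tensor([[2., 2.], [0., 0.]]))
#   tensor([[0.5000, 0.5000],
#           [0.0000, 0.0000]])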
def load_struct(filename):
"""Load a structure from a file. Supported file extensions: pkl, txt."""
if filename.endswith('.pkl'):
with open(filename, 'rb') as pklf:
struct = pkl.load(pklf)
elif filename.endswith('.txt'):
with open(filename, 'r', encoding='utf-8') as txtf:
struct = [c.strip() for c in txtf.readlines()]
else:
sys.exit('Unknown file extension. Supported: pkl, txt.')
return struct
def cp_file_names(data_dir, i, atomdiv, comdiv) -> tuple[str, str]:
"""Return the names of the train and test sets for the given parameters."""
suf = f'_iter{i}_comdiv{comdiv}_atomdiv{atomdiv}'
train_set_out = data_dir + f'/train_set{suf}.txt'
test_set_out = data_dir + f'/test_set{suf}.txt'
return train_set_out, test_set_out
def get_candidates(current_freq_sums_dict, subset, the_other_set, changes) -> dict:
"""Returns a dictionary containing the candidate changes to the compound and
atom frequency distributions of the train and test sets."""
return {subset: current_freq_sums_dict[subset] + changes,
the_other_set: current_freq_sums_dict[the_other_set] - changes}
class DivideTrainTest:
"""Divide sample set to train and test sets. This class is inherited by other classes
that define the actual algorithm in the divide_corpus() method."""
def __init__(self,
data_dir=".",
group_size=1,
subsample_size=None,
subsample_iter=None,
presplit_train_test=None,
):
self.group_size = group_size
self._read_data(data_dir)
self.n_samples = len(self.sent_ids)
print(f'Size of corpus: {self.n_samples} rows.')
        if subsample_size is not None:
            # check subsample_iter first: comparing an int to None would raise a TypeError
            if subsample_iter is None:
                raise ValueError('subsample_size defined but subsample_iter not defined!')
            if subsample_size < subsample_iter:
                raise ValueError('subsample_size smaller than subsample_iter!')
            self.do_subsample = True
            self.subsample_size = subsample_size
            self.subsample_iter = subsample_iter
        else:
            self.do_subsample = False
            if subsample_iter is not None:
                raise ValueError('subsample_iter defined but subsample_size not defined!')
if self.do_subsample:
if self.n_samples < self.subsample_size:
raise ValueError('subsample_size is larger than the number of sentences!')
self.n_matrix_rows = int(self.subsample_size)
self.random_idxs = torch.zeros(self.subsample_size, device=device)
else:
self.n_matrix_rows = int(self.n_samples)
# sizes of matrices
self.com_dim = self.com_freq_matrix_full.shape[1]
self.atom_dim = self.atom_freq_matrix_full.shape[1]
# 2 matrices: one for each set
self.candidate_com_sums = torch.zeros((2, self.n_matrix_rows, self.com_dim), device=device)
self.candidate_atom_sums = torch.zeros((2, self.n_matrix_rows, self.atom_dim), device=device)
# keep unnormalised vectors as separate variables to enable updating
self.subset_com_freq_sum = torch.zeros((2, self.com_dim), device=device)
self.subset_atom_freq_sum = torch.zeros((2, self.atom_dim), device=device)
# init subset indices in the discard set
self.subset_indices = torch.zeros(self.n_samples, device=device) + DISCARD_SET
# TODO: lose the matrices on CPU and use only the GPU ones
# move the full matrices to the GPU (if available)
self.com_freq_matrix = self.com_freq_matrix_full.clone().detach().to(device)
self.atom_freq_matrix = self.atom_freq_matrix_full.clone().detach().to(device)
self.presplit_train_test = presplit_train_test
if self.presplit_train_test:
self._read_presplit_files(
load_struct(path.join(self.presplit_train_test, 'train.txt')),
load_struct(path.join(self.presplit_train_test, 'test.txt')))
def _read_presplit_files(self, train_set, test_set) -> None:
"""Read the pre-split train and test sets from files."""
print('Reading pre-split train and test sets...')
train_set_idxs = [self.sent_ids_inv[sent] for sent in train_set]
test_set_idxs = [self.sent_ids_inv[sent] for sent in test_set]
for idx in train_set_idxs:
self.subset_indices[idx] = TRAIN_SET
for idx in test_set_idxs:
self.subset_indices[idx] = TEST_SET
train_set = torch.tensor(train_set_idxs, device=device)
test_set = torch.tensor(test_set_idxs, device=device)
        # accumulate the frequency sums of each set; iterating over an empty set
        # simply leaves the zero-initialised sums in place
        self.subset_com_freq_sum[TRAIN_SET] = torch.zeros(self.com_dim, device=device)
        self.subset_atom_freq_sum[TRAIN_SET] = torch.zeros(self.atom_dim, device=device)
        for idx in tqdm(train_set):
            self.subset_com_freq_sum[TRAIN_SET] += self.com_freq_matrix[idx].to_dense()
            self.subset_atom_freq_sum[TRAIN_SET] += self.atom_freq_matrix[idx].to_dense()
        self.subset_com_freq_sum[TEST_SET] = torch.zeros(self.com_dim, device=device)
        self.subset_atom_freq_sum[TEST_SET] = torch.zeros(self.atom_dim, device=device)
        for idx in tqdm(test_set):
            self.subset_com_freq_sum[TEST_SET] += self.com_freq_matrix[idx].to_dense()
            self.subset_atom_freq_sum[TEST_SET] += self.atom_freq_matrix[idx].to_dense()
print('Done reading pre-split train and test sets.')
def _read_data(self, data_dir: str) -> None:
group_suffix = '' if self.group_size == 1 else f'_group{self.group_size}'
print('Reading data from files...')
self.atom_freq_matrix_full = torch.load(
path.join(data_dir, f'atom_freqs{group_suffix}.pt'),
map_location=secondary_device
)
self.com_freq_matrix_full = torch.load(
path.join(data_dir, f'compound_freqs{group_suffix}.pt'),
map_location=secondary_device
)
self.atom_ids = load_struct(path.join(data_dir, 'atom_ids.pkl'))
self.com_ids = load_struct(path.join(data_dir, 'com_ids.pkl'))
self.sent_ids = load_struct(path.join(data_dir, f'used_sent_ids{group_suffix}.txt'))
        # check if the file path.join(data_dir, 'sent_sizes.txt') exists
if path.isfile(path.join(data_dir, 'sent_sizes.txt')):
self.sent_sizes = torch.tensor([int(x) for x in
load_struct(path.join(data_dir, 'sent_sizes.txt'))])
else:
self.sent_sizes = torch.ones(len(self.sent_ids))
self.sent_ids_inv = {sent_id: idx for idx, sent_id in enumerate(self.sent_ids)}
print('Done reading data.')
def write_ids_to_file(self, set_ids, set_output):
"""Write the ids of the sentences in the set to a file."""
        with open(set_output, 'w', encoding='utf-8') as f:
            if self.group_size == 1:
                for sent_id in set_ids:
                    f.write(f'{sent_id}\n')
            else:
                for group in set_ids:
                    for sent_id in group:
                        f.write(f'{sent_id}\n')
    def print_subset_atoms_and_compounds(self, print_all=False, separate=True):
        """Print the atoms and compounds in the train and test sets: one list per
        set if separate=True, otherwise side-by-side frequency columns."""
if not separate:
print('\nATOMS; TRAIN SET, TEST SET:')
for atom, freq_train, freq_test in zip(self.atom_ids.keys(),
self.subset_atom_freq_sum[TRAIN_SET],
self.subset_atom_freq_sum[TEST_SET]):
if print_all or freq_train > 0 or freq_test > 0:
print(f'{atom}: {freq_train} {freq_test}')
print('\nCOMPOUNDS; TRAIN SET, TEST SET:')
for compound, freq_train, freq_test in zip(self.com_ids.keys(),
self.subset_com_freq_sum[TRAIN_SET],
self.subset_com_freq_sum[TEST_SET]):
if print_all or freq_train > 0 or freq_test > 0:
print(f'{compound}: {freq_train} {freq_test}')
else:
print('\nATOMS in TRAIN SET:')
print([atom for atom, freq in zip(self.atom_ids.keys(),
self.subset_atom_freq_sum[TRAIN_SET]) if freq > 0])
print('\nATOMS in TEST SET:')
print([atom for atom, freq in zip(self.atom_ids.keys(),
self.subset_atom_freq_sum[TEST_SET]) if freq > 0])
print('\nCOMS in TRAIN SET:')
print([com for com, freq in zip(self.com_ids.keys(),
self.subset_com_freq_sum[TRAIN_SET]) if freq > 0])
print('\nCOMS in TEST SET:')
print([com for com, freq in zip(self.com_ids.keys(),
self.subset_com_freq_sum[TEST_SET]) if freq > 0])
def divide_corpus(self):
"""Divide the corpus into train and test sets."""
raise NotImplementedError('This method must be implemented in a subclass.')
def get_divergences(self):
"""Returns the current atom and compound divergences."""
atomdiv = 1 - chernoff_coef(
normalize_vector(self.subset_atom_freq_sum[TRAIN_SET]),
normalize_vector(self.subset_atom_freq_sum[TEST_SET]), 0.5)
comdiv = 1 - chernoff_coef(
normalize_vector(self.subset_com_freq_sum[TRAIN_SET]),
normalize_vector(self.subset_com_freq_sum[TEST_SET]), 0.1)
return atomdiv, comdiv
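    # Illustration of the default targets: with target_atom_div=0.0 and
    # target_com_div=1.0, an ideal split keeps the train/test atom distributions
    # identical (atomdiv -> 0) while making their compound distributions as
    # disjoint as possible (comdiv -> 1).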
def get_subset_indices(self, subset):
"""Returns the indices of the given subset."""
return (self.subset_indices == subset).nonzero()
def get_subset_sizes(self):
"""Return the sizes of the train and test sets."""
# train_size = self.get_subset_indices(TRAIN_SET).size()[0]
# test_size = self.get_subset_indices(TEST_SET).size()[0]
train_size = self.sent_sizes[self.get_subset_indices(TRAIN_SET)].sum()
test_size = self.sent_sizes[self.get_subset_indices(TEST_SET)].sum()
return train_size, test_size
class FromEmptySets(DivideTrainTest):
"""Divide sample set to train and test sets, starting from empty sets."""
def __init__(self, **kwargs):
super().__init__(**kwargs)
print('Initialising the matrices...')
# used_ids_mask contains -inf for sentences that are used, 0 otherwise
self.used_ids_mask = torch.zeros(self.n_matrix_rows, device=device)
if self.presplit_train_test is None:
if self.do_subsample:
# initialise the random subsample indices
self.random_idxs, _, _ = self._get_random_subsample(DISCARD_SET)
else:
self._subsample(DISCARD_SET, [TRAIN_SET, TEST_SET])
print('Initialisation done.')
def _get_random_subsample(self, subset_id):
"""Take a random subsample of the sentences in the given subset."""
subset_indices = self.get_subset_indices(subset_id).squeeze()
random_subsample_indices = subset_indices[
torch.randperm(subset_indices.shape[0], device=device)][:self.subsample_size]
random_atom = self.atom_freq_matrix.index_select(0, random_subsample_indices).to_dense()
random_com = self.com_freq_matrix.index_select(0, random_subsample_indices).to_dense()
return random_subsample_indices, random_atom, random_com
def _subsample(self, from_set, to_sets):
"""Take a random subsample of sentences in from_set, of size self.subsample_size.
Update candidate_com_sums and candidate_atom_sums with the new subsample."""
self.random_idxs, random_atom, random_com = self._get_random_subsample(from_set)
self.used_ids_mask = torch.zeros(self.subsample_size, device=device)
for to_set in to_sets:
# a new matrix that has the freq sums with each new sample and the new random subset
self.candidate_com_sums[to_set] = self.subset_com_freq_sum[to_set] + random_com
self.candidate_atom_sums[to_set] = self.subset_atom_freq_sum[to_set] + random_atom
def _best_sentence(self, atom_div_vec, com_div_vec, mask=None) -> dict:
"""Return the argmax and max score, and the compound and atom divergences of
the sample that maximises the score. Score is the linear combination of
the negated differences between the target divergences and actual divergences.
Optionally, a mask can be applied to the score vector, e.g. to avoid selecting
sentences that have already been selected."""
scores = get_scores(atom_div_vec, com_div_vec, self.target_atom_div, self.target_com_div)
if mask is not None:
scores += mask
best_idx = torch.argmax(scores)
return {'idx': best_idx,
'score': scores[best_idx],
'atomdiv': atom_div_vec[best_idx],
'comdiv': com_div_vec[best_idx]}
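    # Sketch of the masking mechanism (illustrative tensors): entries already used
    # carry MASK_VALUE = -inf in the mask, so they can never win the argmax:
    #   >>> scores = torch.tensor([0.2, 0.9, 0.5])
    #   >>> mask = torch.tensor([0., MASK_VALUE, 0.])
    #   >>> torch.argmax(scores + mask)
    #   tensor(2)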
def _add_sample_to_set(self, subset, selected_idx, from_set=None):
"""Update the data structs after a new sample has been selected to a subset."""
# mark the selected index as used
self.used_ids_mask[selected_idx] = MASK_VALUE
# if we are using a subsample, we need to map the selected index to the original
if self.do_subsample:
selected_idx = self.random_idxs[selected_idx]
self.subset_indices[selected_idx] = subset
selected_com_row = self.com_freq_matrix[selected_idx].to_dense()
selected_atom_row = self.atom_freq_matrix[selected_idx].to_dense()
self.subset_com_freq_sum[subset] += selected_com_row
self.subset_atom_freq_sum[subset] += selected_atom_row
if from_set is not None:
self.subset_com_freq_sum[from_set] -= selected_com_row
self.subset_atom_freq_sum[from_set] -= selected_atom_row
    def _candidate_divergences(self, subset_id: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Compute the atom and compound divergences between the other subset's
        frequency-sum vector and each row of this subset's candidate-sum matrix."""
if subset_id == TRAIN_SET:
the_other_set_id = TEST_SET
com_alpha = 0.1
else:
the_other_set_id = TRAIN_SET
com_alpha = 0.9 # because test set is the first argument to chernoff_coef()
# The atom divergence is 1 - Chernoff coefficient, with alpha=0.5.
atom_div_add_to_subset = mat_vec_divergence(self.candidate_atom_sums[subset_id],
self.subset_atom_freq_sum[the_other_set_id], 0.5)
# The compound divergence is 1 - Chernoff coefficient, with alpha=0.1.
com_div_add_to_subset = mat_vec_divergence(self.candidate_com_sums[subset_id],
self.subset_com_freq_sum[the_other_set_id], com_alpha)
return atom_div_add_to_subset, com_div_add_to_subset
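    # Why alpha flips to 0.9: the Chernoff coefficient is asymmetric, but
    # C_alpha(P||Q) = C_{1-alpha}(Q||P). The compound divergence is defined with
    # the train-set distribution as P and alpha=0.1, so when the test-set sums are
    # passed as the first argument, alpha = 1 - 0.1 = 0.9 yields the same value.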
    def _move_a_sample(self, from_set, to_set, randoms_atom, randoms_com):
        """Select the sample that maximises the score when moved from one set to
        the other, and move it, but only if that improves on the current score."""
candidate_com_sums_from = self.subset_com_freq_sum[from_set] - randoms_com
candidate_com_sums_to = self.subset_com_freq_sum[to_set] + randoms_com
candidate_atom_sums_from = self.subset_atom_freq_sum[from_set] - randoms_atom
candidate_atom_sums_to = self.subset_atom_freq_sum[to_set] + randoms_atom
# 0.9 if test set is the first argument to chernoff_coef()
com_alpha = 0.1 if from_set == TRAIN_SET else 0.9
atom_divs = mat_mat_divergence(candidate_atom_sums_from, candidate_atom_sums_to, 0.5)
com_divs = mat_mat_divergence(candidate_com_sums_from, candidate_com_sums_to, com_alpha)
best_cand = self._best_sentence(atom_divs, com_divs)
if best_cand['score'] > get_scores(*self.get_divergences(), self.target_atom_div,
self.target_com_div):
self._add_sample_to_set(to_set, best_cand['idx'], from_set=from_set)
def divide_corpus(self,
target_atom_div=0.0,
target_com_div=1.0,
min_test_percent=0.05,
max_test_percent=0.3,
select_n_samples=None,
max_iters=None,
print_every=10000,
save_cp=100000,
output_dir=".",
move_a_sample_iter=-1,
):
"""Divide data into train and test sets. At each iteration, select the sample from
sample_matrix that maximises the score."""
self.target_atom_div = target_atom_div
self.target_com_div = target_com_div
        remaining_size = self.get_subset_indices(DISCARD_SET).size()[0]
        # self.subsample_size only exists when subsampling is enabled
        subsample_margin = self.subsample_size if self.do_subsample else 1
        if select_n_samples is None or select_n_samples > remaining_size - subsample_margin:
            print('Warning: using all samples in corpus.')
            # not using the last, incomplete subsample
            # TODO: fix this to use all sentences
            if self.do_subsample:
                select_n_samples = remaining_size - self.subsample_size
            else:
                select_n_samples = remaining_size - 1
# reduce the intervals in proportion to the group size
if print_every > self.group_size:
print_every = print_every // self.group_size
if save_cp > self.group_size:
save_cp = save_cp // self.group_size
if max_iters and max_iters > self.group_size:
max_iters = max_iters // self.group_size
def _print_iteration():
print(f'After iteration {i+1}: Train set size {train_size}; ' \
+ f'Test set size {test_size}. ' \
+ f'Compound divergence {float(best_values[selected_set]["comdiv"])}; ' \
+ f'Atom divergence {float(best_values[selected_set]["atomdiv"])}')
def _save_division():
train_set_out, test_set_out = cp_file_names(output_dir, i,
float(best_values[selected_set]["atomdiv"]),
float(best_values[selected_set]["comdiv"]))
self.write_ids_to_file(
[self.sent_ids[ind] for ind in self.get_subset_indices(TRAIN_SET)], train_set_out)
self.write_ids_to_file(
[self.sent_ids[ind] for ind in self.get_subset_indices(TEST_SET)], test_set_out)
best_values = {}
train_size, test_size = self.get_subset_sizes()
if train_size == 0 and test_size == 0:
# initialize train set with one random sample
self._add_sample_to_set(TRAIN_SET, random.randrange(self.n_matrix_rows))
print('Starting from scratch, initialised the train set with one random sentence.')
print(
f'Starting division. Train set size: {int(train_size)}, ' + \
f'Test set size: {int(test_size)}. ' + \
f'Remaining samples to divide: {select_n_samples}.'
)
for i in tqdm(range(select_n_samples)):
train_size, test_size = self.get_subset_sizes()
test_percent = test_size / (train_size + test_size)
if test_percent > max_test_percent: # First check the size constraints
best_values[TRAIN_SET] = self._best_sentence(
*self._candidate_divergences(TRAIN_SET), mask=self.used_ids_mask)
selected_set = TRAIN_SET
elif test_percent < min_test_percent:
best_values[TEST_SET] = self._best_sentence(
*self._candidate_divergences(TEST_SET), mask=self.used_ids_mask)
selected_set = TEST_SET
else: # otherwise compare the max scores of the two sets
best_values[TRAIN_SET] = self._best_sentence(
*self._candidate_divergences(TRAIN_SET), mask=self.used_ids_mask)
best_values[TEST_SET] = self._best_sentence(
*self._candidate_divergences(TEST_SET), mask=self.used_ids_mask)
if best_values[TRAIN_SET]['score'] > best_values[TEST_SET]['score']:
selected_set = TRAIN_SET
else:
selected_set = TEST_SET
selected_idx = best_values[selected_set]['idx']
self._add_sample_to_set(selected_set, selected_idx)
            # check move_a_sample_iter > 0 first to avoid taking a modulo by zero
            if move_a_sample_iter > 0 and i % move_a_sample_iter == 0 \
                    and train_size > 1 and test_size > 1:
self.random_idxs, random_atom, random_com = self._get_random_subsample(TRAIN_SET)
self._move_a_sample(TRAIN_SET, TEST_SET, random_atom, random_com)
self.random_idxs, random_atom, random_com = self._get_random_subsample(TEST_SET)
self._move_a_sample(TEST_SET, TRAIN_SET, random_atom, random_com)
self._subsample(DISCARD_SET, [TRAIN_SET, TEST_SET])
elif self.do_subsample and i % self.subsample_iter == 0:
self._subsample(DISCARD_SET, [TRAIN_SET, TEST_SET])
else:
self.candidate_com_sums[selected_set] += \
self.com_freq_matrix[selected_idx].to_dense()
self.candidate_atom_sums[selected_set] += \
self.atom_freq_matrix[selected_idx].to_dense()
if i % print_every == 0:
_print_iteration()
if i % save_cp == 0:
_save_division()
if i == max_iters:
break
print('Division complete.')
_print_iteration()
_save_division()
def divide_corpus_with_fixed_train_set(self,
fixed_train_freq_sums,
target_atom_div=0.0,
target_com_div=1.0,
select_n_samples=None,
max_iters=None,
print_every=10000,
save_cp=100000,
output_dir=".",
):
"""Select a training set based on the given test set. At each iteration, select the sample
from sample_matrix that maximises the score, and add it to training set."""
self.subset_atom_freq_sum[TRAIN_SET] = torch.load(
path.join(fixed_train_freq_sums, 'atom_freq_sum_train.pt'),
map_location=device
)
self.subset_com_freq_sum[TRAIN_SET] = torch.load(
path.join(fixed_train_freq_sums, 'com_freq_sum_train.pt'),
map_location=device
)
self.target_atom_div = target_atom_div
self.target_com_div = target_com_div
        remaining_size = self.get_subset_indices(DISCARD_SET).size()[0]
        # self.subsample_size only exists when subsampling is enabled
        subsample_margin = self.subsample_size if self.do_subsample else 1
        if select_n_samples is None or select_n_samples > remaining_size - subsample_margin:
            print('Warning: using all samples in corpus.')
            # not using the last, incomplete subsample
            # TODO: fix this to use all sentences
            if self.do_subsample:
                select_n_samples = remaining_size - self.subsample_size
            else:
                select_n_samples = remaining_size - 1
# reduce the intervals in proportion to the group size
if print_every > self.group_size:
print_every = print_every // self.group_size
if save_cp > self.group_size:
save_cp = save_cp // self.group_size
if max_iters and max_iters > self.group_size:
max_iters = max_iters // self.group_size
def _print_iteration():
print(f'After iteration {i+1}: Train set size {train_size}; ' \
+ f'Test set size {test_size}. ' \
+ f'Compound divergence {float(best_values[selected_set]["comdiv"])}; ' \
+ f'Atom divergence {float(best_values[selected_set]["atomdiv"])}')
def _save_division():
_, test_set_out = cp_file_names(output_dir, i,
float(best_values[selected_set]["atomdiv"]),
float(best_values[selected_set]["comdiv"]))
self.write_ids_to_file(
[self.sent_ids[ind] for ind in self.get_subset_indices(TEST_SET)], test_set_out)
best_values = {}
train_size, test_size = self.get_subset_sizes()
print(
f'Starting division. Train set size: {int(train_size)}, ' + \
f'Test set size: {int(test_size)}. ' + \
f'Remaining samples to divide: {select_n_samples}.'
)
for i in tqdm(range(select_n_samples)):
best_values[TEST_SET] = self._best_sentence(
*self._candidate_divergences(TEST_SET), mask=self.used_ids_mask)
selected_set = TEST_SET
selected_idx = best_values[selected_set]['idx']
self._add_sample_to_set(selected_set, selected_idx)
if self.do_subsample and i % self.subsample_iter == 0:
self._subsample(DISCARD_SET, [TEST_SET])
else:
self.candidate_com_sums[selected_set] += \
self.com_freq_matrix[selected_idx].to_dense()
self.candidate_atom_sums[selected_set] += \
self.atom_freq_matrix[selected_idx].to_dense()
if i % print_every == 0:
_print_iteration()
if i % save_cp == 0:
_save_division()
if i == max_iters:
break
print('Division complete.')
_print_iteration()
_save_division()
def create_parser():
"""Create the argument parser."""
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--data-dir", type=str, default=None,
help="Path to the directory containing the data files.")
arg_parser.add_argument("--algorithm", type=str, default='greedy',
choices=['greedy', 'sa', 'greedy_from_random_divide'],
help="Algorithm to use for splitting the corpus.")
arg_parser.add_argument("--min-test-percent", type=float, default=0.05,
help="Minimum ratio of test set size to train set size.")
arg_parser.add_argument("--max-test-percent", type=float, default=0.3,
help="Maximum ratio of test set size to train set size.")
arg_parser.add_argument("--subsample-size", type=int, default=None)
arg_parser.add_argument("--subsample-iter", type=int, default=None,
help="Subsample set every n iterations.")
arg_parser.add_argument("--move-a-sample-iter", type=int, default=20,
help="Move a sample from test set to train, and vice versa, every n iterations.")
arg_parser.add_argument("--group-size", type=int, default=None,
help="Group sentences to reduce the number of rows in the matrices.")
arg_parser.add_argument("--move-n", type=int, default=1,
help="Number of sentences to move at each iteration.")
arg_parser.add_argument("--atom-divergence", type=float, default=0.0)
arg_parser.add_argument("--compound-divergence", type=float, default=1.0)
arg_parser.add_argument("--presplit-train-test", default=None, type=str,
help="Dir containing train and test sets to initialise the split.")
arg_parser.add_argument("--leave-out", type=float, default=0.0)
arg_parser.add_argument("--max-iters", type=int, default=None)
arg_parser.add_argument("--random-seed", type=int, default=1234)
arg_parser.add_argument("--print-every", type=int, default=1000)
arg_parser.add_argument("--save-cp", type=int, default=50000,
help="Write test and train sets to txt file at every n iteration")
arg_parser.add_argument("--fixed-train-freqs", type=str, default=None,
help="Directory containing the fixed train set frequency sums.")
return arg_parser
def launch_from_empty_sets(args):
"""Run the greedy algorithm."""
divide_train_test = FromEmptySets(
data_dir=args.data_dir,
subsample_size=args.subsample_size,
subsample_iter=args.subsample_iter,
group_size=args.group_size,
presplit_train_test=args.presplit_train_test,
)
presplit = 'yes' if args.presplit_train_test else 'no'
use_n_samples = int((1 - args.leave_out) * divide_train_test.n_samples) - 1
output_dir_name = f'{args.data_dir}/comdiv{args.compound_divergence}' \
+ f'_atomdiv{args.atom_divergence}' \
+ f'_seed{args.random_seed}' \
+ f'_subsample{args.subsample_size}every{args.subsample_iter}iters' \
+ f'_groupsize{args.group_size}' \
+ f'_testsize{args.min_test_percent}to{args.max_test_percent}' \
+ f'_leaveout{args.leave_out}' \
+ f'_moveasampleevery{args.move_a_sample_iter}iters' \
+ f'_presplit{presplit}'
if path.isdir(output_dir_name):
# sys.exit('Output directory already exists. Exiting.')
output_dir_name += '_resumed'
makedirs(output_dir_name)
if args.presplit_train_test:
with open(output_dir_name + '/presplit_train_test.txt', 'w', encoding='utf-8') as f:
f.write(args.presplit_train_test)
print('Dividing the corpus, writing to', output_dir_name)
if args.fixed_train_freqs:
divide_train_test.divide_corpus_with_fixed_train_set(
fixed_train_freq_sums=args.fixed_train_freqs,
target_atom_div=args.atom_divergence,
target_com_div=args.compound_divergence,
select_n_samples=use_n_samples,
max_iters=args.max_iters,
print_every=args.print_every,
save_cp=args.save_cp,
output_dir=output_dir_name,
)
else:
divide_train_test.divide_corpus(
target_atom_div=args.atom_divergence,
target_com_div=args.compound_divergence,
min_test_percent=args.min_test_percent,
max_test_percent=args.max_test_percent,
select_n_samples=use_n_samples,
print_every=args.print_every,
max_iters=args.max_iters,
save_cp=args.save_cp,
output_dir=output_dir_name,
move_a_sample_iter=args.move_a_sample_iter,
)
divide_train_test.print_subset_atoms_and_compounds()
def main():
args = create_parser().parse_args()
print('Args:')
for arg in vars(args):
print(f'\t{arg}: {getattr(args, arg)}')
random.seed(args.random_seed)
launch_from_empty_sets(args)
if __name__ == "__main__":
main()