-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
aux folder created and slight modifications to enable similarities calculation and indexation
- Loading branch information
Alberto Mendoza García
committed
Sep 7, 2023
1 parent
ee61a93
commit 6e144f2
Showing
5 changed files
with
169 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import numpy as np | ||
import scipy.sparse as sparse | ||
from sparse_dot_topn import awesome_cossim_topn | ||
import time | ||
import pandas as pd | ||
import pathlib | ||
|
||
|
||
# Compute the top-n document-by-document similarities from the document-topic
# matrix (thetas.npz) and store them as a sparse matrix (distances.npz).

# Folder that holds thetas.npz and receives distances.npz.
# (Previously stored in a variable called ``dir``, which shadowed the builtin.)
TM_MODEL_DIR = "/export/usuarios_ml4ds/amendoza/Intelcomp/EWB/data/source/Mallet-20/TMmodel"

t_start = time.perf_counter()
TMfolder = pathlib.Path(TM_MODEL_DIR)
thetas = sparse.load_npz(TMfolder.joinpath('thetas.npz'))
print(f"Shape of thetas: {np.shape(thetas)} ")

# Element-wise square root of thetas and its transpose: multiplying both gives,
# for every pair of rows, the sum over columns of sqrt(theta_i * theta_j).
thetas_sqrt = np.sqrt(thetas)
thetas_col = thetas_sqrt.T

# Alternatives that were tried for topn:
# topn = np.shape(thetas)[0]
# topn = numero_filas
# print(f"Topn: {topn}")
#
# Open question: setting topn equal to the size of the dataset raises:
#   Traceback (most recent call last):
#     File "/export/usuarios_ml4ds/amendoza/top-comp/automatic_scripts/similarities.py", line 42, in <module>
#       sims = awesome_cossim_topn(thetas_sqrt, thetas_col, topn, lb)
#     File "/export/usuarios_ml4ds/amendoza/top-comp/.venv/lib/python3.10/site-packages/sparse_dot_topn/awesome_cossim_topn.py", line 102, in awesome_cossim_topn
#       alt_indices, alt_data = ct.sparse_dot_topn_extd(
#     File "sparse_dot_topn/sparse_dot_topn.pyx", line 149, in sparse_dot_topn.sparse_dot_topn.__pyx_fuse_0sparse_dot_topn_extd
#     File "sparse_dot_topn/sparse_dot_topn.pyx", line 219, in sparse_dot_topn.sparse_dot_topn.sparse_dot_topn_extd
#   OverflowError: value too large to convert to int
# However, a run with topn=40000 produced a sims matrix of shape
# (dataset size x dataset size). Execution time: 28 min.
topn = 300
print(f"Topn: {topn}")
lb = 0  # lower bound passed to awesome_cossim_topn

sims = awesome_cossim_topn(thetas_sqrt, thetas_col, topn, lb)
sparse.save_npz(TMfolder.joinpath('distances.npz'), sims)

t_end = time.perf_counter()
# perf_counter() works in seconds; the value printed below is in minutes
t_total = (t_end - t_start) / 60
print(f"Total computation time: {t_total}")

# --- Disabled diagnostics on the saved matrix (kept for reference) ---
# TMfolder_sims = TMfolder.joinpath('distances.npz')
# sims = sparse.load_npz(TMfolder_sims).toarray()
# size = np.size(sims)
# num_nonzeros = np.count_nonzero(sims)
# percentage = ((size-num_nonzeros)/size)*100
# print(f"Percentage of zeros is: {percentage}")
# matriz_sin_ceros = np.ma.masked_where(sims == 0, sims)
# # Find the minimum and maximum of the non-zero elements
# valor_minimo = np.min(matriz_sin_ceros)
# valor_maximo = np.max(matriz_sin_ceros)
# print("Valor mínimo de los elementos que no son cero:", valor_minimo)
# print("Valor máximo de los elementos que no son cero:", valor_maximo)
# sims_list = []
# print(f"Shape of sims: {np.shape(sims)} ")
# for i in range(0, len(sims)):
#     sims_list.append(np.count_nonzero(sims[i]))
# print(f"Length of sims_list: {len(sims_list)}")
# maximo_elemento = max(sims_list)
# print(f"Num of non zero elements in the document with more similarities computed {maximo_elemento}")
# print(f"Non zero elements in one row of sims: {np.count_nonzero(sims[1])}")
# print(f"Size of one row of sims: {np.size(sims[1])}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import encodings | ||
import pathlib | ||
import scipy.sparse as sparse | ||
from typing import List | ||
import time | ||
import tqdm | ||
|
||
def get_doc_by_doc_sims(W, ids_corpus) -> List[str]:
    """
    Build, for each document, a string listing the documents it is similar to.

    Parameters
    ----------
    W: scipy.sparse.csr_matrix
        Sparse matrix with the similarities between each pair of documents
        in the corpus collection.
    ids_corpus: List[str]
        List of ids of the documents in the corpus collection; it is indexed
        with the column positions of W.

    Returns:
    --------
    sims: List[str]
        One string per row of W, made of space-separated "<id>|<similarity>"
        items for the non-zero entries of that row, in stored order. The
        first non-zero entry of every row is left out (presumably the
        document's similarity with itself -- confirm against the producer
        of W). Rows with at most one non-zero entry yield an empty string.
    """
    # Work on the raw CSR arrays: each row is a contiguous slice, so no
    # per-row scan over all non-zeros and no per-element W[row, col] lookup.
    # NOTE(review): assumes W holds no duplicate (row, col) entries.
    W = W.tocsr()
    indptr, indices, data = W.indptr, W.indices, W.data

    sim_str = []
    for row in range(W.shape[0]):
        start, end = indptr[row], indptr[row + 1]
        cols = indices[start:end]
        vals = data[start:end]

        # Ignore explicitly stored zeros, as W.nonzero() does
        keep = vals != 0
        cols, vals = cols[keep], vals[keep]

        # Skip the first remaining entry of the row
        sim_str.append(' '.join(
            f"{ids_corpus[col]}|{val}"
            for col, val in zip(cols[1:], vals[1:])))

    return sim_str
|
||
|
||
|
||
# Root folder of the model data.
# NOTE: ``dir`` shadows the builtin of the same name; the name is kept
# because later statements in this script read it.
dir = pathlib.Path("/export/usuarios_ml4ds/amendoza/Intelcomp/EWB/data/source/Mallet-20/")

# Load the sparse similarity matrix stored under the TMmodel subfolder
sims = sparse.load_npz(dir.joinpath("TMmodel").joinpath("distances.npz"))
print("Sims obtained")
|
||
def process_line(line):
    """
    Extract the integer document id found at the start of a corpus line.

    Parameters
    ----------
    line: str
        A line whose id field comes before the first ' 0 ' separator; the
        id may be wrapped in double quotes.

    Returns:
    --------
    id_: int
        The id with surrounding whitespace and double quotes removed.

    Raises
    ------
    ValueError
        If the id field cannot be converted to an integer.
    """
    # Only the text before the first ' 0 ' separator is needed
    raw_id, _, _ = line.partition(' 0 ')
    id_ = int(raw_id.strip().strip('"'))
    return id_
|
||
# Collect the document ids, one per line of the corpus file
with open(dir.joinpath("corpus.txt"), encoding="utf-8") as file:
    ids_corpus = [process_line(line) for line in file]
print("Ids obtained")

print("Starting similarities representation...")
time_start = time.perf_counter()
sim_rpr = get_doc_by_doc_sims(sims, ids_corpus)
time_end = time.perf_counter()
print(f"Similarities representation finished in {time_end - time_start:0.4f} seconds")

print("Writing similarities representation to txt file...")

# Write the output file, one line per document; the encoding is explicit so
# it matches the one used to read the corpus.
with open(dir.joinpath("TMmodel").joinpath('distances.txt'), 'w', encoding="utf-8") as f:
    f.writelines(f"{item}\n" for item in sim_rpr)

# --- Disabled snippet (kept for reference): read the file back ---
# with open('distances.txt', 'r') as f:
#     mi_lista = [line.strip() for line in f]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters