From 6e144f23c5c3c6be59fccac9e0ca31e1122c8076 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alberto=20Mendoza=20Garc=C3=ADa?=
Date: Thu, 7 Sep 2023 23:18:09 +0200
Subject: [PATCH] aux folder created and slight modifications to enable
 similarities calculation and indexation

---
 aux/similarities.py                          | 71 ++++++++++++++++++++
 aux/sims_representation.py                   | 66 ++++++++++++++++++
 restapi/src/core/clients/base/solr_client.py |  2 +-
 restapi/src/core/clients/ewb_solr_client.py  | 30 +++++++--
 restapi/src/core/entities/model.py           |  7 +-
 5 files changed, 169 insertions(+), 7 deletions(-)
 create mode 100644 aux/similarities.py
 create mode 100644 aux/sims_representation.py

diff --git a/aux/similarities.py b/aux/similarities.py
new file mode 100644
index 0000000..8d70d69
--- /dev/null
+++ b/aux/similarities.py
@@ -0,0 +1,71 @@
+import numpy as np
+import scipy.sparse as sparse
+from sparse_dot_topn import awesome_cossim_topn
+import time
+import pandas as pd
+import pathlib
+
+
+dir = "/export/usuarios_ml4ds/amendoza/Intelcomp/EWB/data/source/Mallet-20/TMmodel"
+
+t_start = time.perf_counter()
+TMfolder = pathlib.Path(dir)
+thetas = sparse.load_npz(TMfolder.joinpath('thetas.npz'))
+print(f"Shape of thetas: {np.shape(thetas)} ")
+thetas_sqrt = np.sqrt(thetas)
+thetas_col = thetas_sqrt.T
+#topn=np.shape(thetas)[0]
+#topn=numero_filas
+#print(f"Topn: {topn}")
+"""Question: if I set topn equal to the dataset size, I get the following error:
+Traceback (most recent call last):
+  File "/export/usuarios_ml4ds/amendoza/top-comp/automatic_scripts/similarities.py", line 42, in <module>
+    sims = awesome_cossim_topn(thetas_sqrt, thetas_col, topn, lb)
+  File "/export/usuarios_ml4ds/amendoza/top-comp/.venv/lib/python3.10/site-packages/sparse_dot_topn/awesome_cossim_topn.py", line 102, in awesome_cossim_topn
+    alt_indices, alt_data = ct.sparse_dot_topn_extd(
+  File "sparse_dot_topn/sparse_dot_topn.pyx", line 149, in sparse_dot_topn.sparse_dot_topn.__pyx_fuse_0sparse_dot_topn_extd
+  File "sparse_dot_topn/sparse_dot_topn.pyx", line 219, in sparse_dot_topn.sparse_dot_topn.sparse_dot_topn_extd
+OverflowError: value too large to convert to int
+
+However, I have run it with topn=40000 and the shape of sims is dataset size x dataset size.
+Execution time: 28 min"""
+topn = 300
+print(f"Topn: {topn}")
+lb = 0
+sims = awesome_cossim_topn(thetas_sqrt, thetas_col, topn, lb)
+sparse.save_npz(TMfolder.joinpath('distances.npz'), sims)
+
+t_end = time.perf_counter()
+t_total = (t_end - t_start)/60
+print(f"Total computation time: {t_total} minutes")
+"""
+TMfolder_sims = TMfolder.joinpath('distances.npz')
+sims = sparse.load_npz(TMfolder_sims).toarray()
+size = np.size(sims)
+num_nonzeros = np.count_nonzero(sims)
+percentage = ((size-num_nonzeros)/size)*100
+print(f"Percentage of zeros is: {percentage}")
+
+matriz_sin_ceros = np.ma.masked_where(sims == 0, sims)
+
+# Find the minimum and maximum values among the non-zero elements
+valor_minimo = np.min(matriz_sin_ceros)
+valor_maximo = np.max(matriz_sin_ceros)
+
+print("Minimum value of the non-zero elements:", valor_minimo)
+print("Maximum value of the non-zero elements:", valor_maximo)
+
+sims_list = []
+print(f"Shape of sims: {np.shape(sims)} ")
+
+for i in range(0, len(sims)):
+    sims_list.append(np.count_nonzero(sims[i]))
+
+print(f"Length of sims_list: {len(sims_list)}")
+maximo_elemento = max(sims_list)
+
+print(f"Number of non-zero elements in the document with the most similarities computed: {maximo_elemento}")
+
+print(f"Non-zero elements in one row of sims: {np.count_nonzero(sims[1])}")
+print(f"Size of one row of sims: {np.size(sims[1])}")
+"""

diff --git a/aux/sims_representation.py b/aux/sims_representation.py
new file mode 100644
index 0000000..2183134
--- /dev/null
+++ b/aux/sims_representation.py
@@ -0,0 +1,66 @@
+import encodings
+import pathlib
+import scipy.sparse as sparse
+from typing import List
+import time
+import tqdm
+
+def get_doc_by_doc_sims(W, ids_corpus) -> List[str]:
+    """
+    Builds the string representation of the similarities between each pair of documents in the corpus collection, based on the precomputed similarity matrix obtained from the document-topic distribution of the model being indexed.
+
+    Parameters
+    ----------
+    W: scipy.sparse.csr_matrix
+        Sparse matrix with the similarities between each pair of documents in the corpus collection.
+    ids_corpus: List[str]
+        List of ids of the documents in the corpus collection.
+
+    Returns
+    -------
+    sims: List[str]
+        List of string representations of the top similarities between each pair of documents in the corpus collection.
+    """
+
+    # Get the indices of the non-zero elements
+    non_zero_indices = W.nonzero()
+
+    # Convert each row to a string of "id|similarity" pairs
+    sim_str = \
+        [' '.join([f"{ids_corpus[col]}|{W[row, col]}" for col in non_zero_indices[1]
+                  [non_zero_indices[0] == row]][1:]) for row in range(W.shape[0])]
+
+    return sim_str
+
+
+dir = pathlib.Path("/export/usuarios_ml4ds/amendoza/Intelcomp/EWB/data/source/Mallet-20/")
+
+sims = sparse.load_npz(dir.joinpath("TMmodel").joinpath("distances.npz"))
+print("Sims obtained")
+
+def process_line(line):
+    id_ = line.rsplit(' 0 ')[0].strip()
+    id_ = int(id_.strip('"'))
+    return id_
+
+with open(dir.joinpath("corpus.txt"), encoding="utf-8") as file:
+    ids_corpus = [process_line(line) for line in file]
+print("Ids obtained")
+print("Starting similarities representation...")
+time_start = time.perf_counter()
+sim_rpr = get_doc_by_doc_sims(sims, ids_corpus)
+time_end = time.perf_counter()
+print(f"Similarities representation finished in {time_end - time_start:0.4f} seconds")
+print("Writing similarities representation to txt file...")
+
+# Write the representation to the file
+with open(dir.joinpath("TMmodel").joinpath('distances.txt'), 'w') as f:
+    for item in sim_rpr:
+        f.write("%s\n" % item)
+
+"""
+# Read the file back
+with open('distances.txt', 'r') as f:
+    mi_lista = [line.strip() for line in f]
+"""
\ No newline at end of file
diff --git a/restapi/src/core/clients/base/solr_client.py b/restapi/src/core/clients/base/solr_client.py
index e8506be..5ce10cc 100644
--- a/restapi/src/core/clients/base/solr_client.py
+++ b/restapi/src/core/clients/base/solr_client.py
@@ -207,7 +207,7 @@ def __init__(self, logger: logging.Logger) -> None:
     def _do_request(self,
                     type: str,
                     url: str,
-                    timeout: int = 10,
+                    timeout: int = None,
                     **params) -> SolrResp:
         """Sends a request to the given url with the given params and
         returns an object of the SolrResp class
diff --git a/restapi/src/core/clients/ewb_solr_client.py b/restapi/src/core/clients/ewb_solr_client.py
index f27ea2b..103f312 100644
--- a/restapi/src/core/clients/ewb_solr_client.py
+++ b/restapi/src/core/clients/ewb_solr_client.py
@@ -305,6 +305,10 @@ def index_model(self, model_path: str) -> None:
         else:
             self.logger.info(
                 f"-- -- Collection {model_name} successfully created.")
+
+        metadata = self.do_Q2("cordis")
+        self.logger.info(
+            f"-- -- Metadata of {self.corpus_col} before creating the model: {metadata}")
 
         # 3. Create Model object and extract info from the corpus to index
         model = Model(model_to_index)
@@ -318,6 +322,10 @@ def index_model(self, model_path: str) -> None:
             return
         field_update = model.get_corpora_model_update(
             id=results.docs[0]["id"], action='add')
+
+        metadata = self.do_Q2("cordis")
+        self.logger.info(
+            f"-- -- Metadata of {self.corpus_col} before adding the doc-tpc distribution: {metadata}")
 
         # 4. Add field for the doc-tpc distribution associated with the model being indexed in the document associated with the corpus
         self.logger.info(
@@ -325,6 +333,10 @@ def index_model(self, model_path: str) -> None:
         self.index_documents(field_update, self.corpus_col, self.batch_size)
         self.logger.info(
             f"-- -- Indexing of model information of {model_name} in {self.corpus_col} completed.")
+
+        metadata = self.do_Q2("cordis")
+        self.logger.info(
+            f"-- -- Metadata of {self.corpus_col} before modifying the schema: {metadata}")
 
         # 5. Modify schema in corpus collection to add field for the doc-tpc distribution and the similarities associated with the model being indexed
         model_key = 'doctpc_' + model_name
@@ -337,6 +349,11 @@ def index_model(self, model_path: str) -> None:
             f"-- -- Adding field {sim_model_key} in {corpus_name} collection")
         _, err = self.add_field_to_schema(
             col_name=corpus_name, field_name=sim_model_key, field_type='VectorFloatField')
+
+
+        metadata = self.do_Q2("cordis")
+        self.logger.info(
+            f"-- -- Metadata of {self.corpus_col} before indexing doc-tpc information: {metadata}")
 
         # 6. Index doc-tpc information in corpus collection
         self.logger.info(
@@ -610,9 +627,10 @@ def do_Q1(self,
             resp = {'thetas': -1}
 
         return resp, sc
-
+
     def do_Q2(self, corpus_col: str) -> Union[dict, int]:
-        """Executes query Q2.
+        """
+        Executes query Q2.
 
         Parameters
         ----------
@@ -646,11 +664,13 @@ def do_Q2(self, corpus_col: str) -> Union[dict, int]:
             return
 
         # Filter out metadata fields that we don't consider metadata
+        #meta_fields = [field for field in results.docs[0]
+        #['fields'] if field not in self.no_meta_fields and not field.startswith("doctpc_")]
         meta_fields = [field for field in results.docs[0]
-                       ['fields'] if field not in self.no_meta_fields and not field.startswith("doctpc_")]
-
+                       ['fields'] if field not in self.no_meta_fields]
+
         return {'metadata_fields': meta_fields}, sc
-
+
     def do_Q3(self, col: str) -> Union[dict, int]:
         """Executes query Q3.
diff --git a/restapi/src/core/entities/model.py b/restapi/src/core/entities/model.py
index 0792dfd..9bf50a8 100644
--- a/restapi/src/core/entities/model.py
+++ b/restapi/src/core/entities/model.py
@@ -273,12 +273,17 @@ def get_doc_by_doc_sims(W, ids_corpus) -> List[str]:
 
             return sim_str
 
-        sim_rpr = get_doc_by_doc_sims(self.sims, ids_corpus)
+        #sim_rpr = get_doc_by_doc_sims(self.sims, ids_corpus)
+
+        with open(self.path_to_model.joinpath("TMmodel").joinpath('distances.txt'), 'r') as f:
+            sim_rpr = [line.strip() for line in f]
 
         self._logger.info(
             "Thetas and sims attained. Creating dataframe...")
 
         # Save the information in a dataframe
         df = pd.DataFrame(list(zip(ids_corpus, doc_tpc_rpr, sim_rpr)),
                           columns=['id', model_key, sim_model_key])
+        self._logger.info(
+            f"Dataframe created. Columns: {df.columns.tolist()}")
 
         # self._logger.info("Merging dataframes...")
         # df = pd.merge(df, df_orig_ids, on=['id'], how='outer').fillna("")
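
A note on the OverflowError quoted in aux/similarities.py: it appears to come from sparse_dot_topn's internal index buffers, whose size grows with the number of rows times topn and must fit in a 32-bit int. A common workaround is to compute the top-n similarities in horizontal row blocks and stack the partial results, so each individual call stays under that limit. The following is a minimal sketch of that idea, not part of this patch; the helper name, block size, and paths are illustrative only:

import numpy as np
import scipy.sparse as sparse
from sparse_dot_topn import awesome_cossim_topn

def blockwise_cossim_topn(A, B, topn, lower_bound=0, block_size=10000):
    # Process A in row slices so that each awesome_cossim_topn call only
    # allocates on the order of block_size * topn entries, which keeps the
    # internal index arrays below the 32-bit limit that triggers the
    # "value too large to convert to int" OverflowError.
    blocks = []
    for start in range(0, A.shape[0], block_size):
        block = A[start:start + block_size]  # CSR row slice
        blocks.append(awesome_cossim_topn(block, B, topn, lower_bound))
    # Stack the per-block top-n results back into one sparse matrix
    return sparse.vstack(blocks).tocsr()

# Hypothetical usage mirroring aux/similarities.py:
# thetas = sparse.load_npz("thetas.npz")
# thetas_sqrt = np.sqrt(thetas)
# sims = blockwise_cossim_topn(thetas_sqrt, thetas_sqrt.T, topn=300)
# sparse.save_npz("distances.npz", sims)

Since each row's top-n entries are independent of every other row, blocking changes only memory behavior, not the result.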