aux folder created and slight modifications to enable similarities calculation and indexation
IntelCompH2020 · Sep 7, 2023 · 6e144f2 · 6e144f2
1 parent ee61a93
commit 6e144f2
Show file tree

Hide file tree

Showing 5 changed files with 169 additions and 7 deletions.
diff --git a/aux/similarities.py b/aux/similarities.py
@@ -0,0 +1,71 @@
+import numpy as np
+import scipy.sparse as sparse
+from sparse_dot_topn import awesome_cossim_topn
+import time
+import pandas as pd
+import pathlib
+
+
+# Script body: load the `thetas.npz` sparse matrix from the model folder, take
+# its square root, multiply it by its own transpose through
+# `awesome_cossim_topn`, and save the result as `distances.npz`.
+# Hard-coded location of the TMmodel folder the script reads from and writes to.
+# NOTE(review): `dir` shadows the Python builtin of the same name.
+dir = "/export/usuarios_ml4ds/amendoza/Intelcomp/EWB/data/source/Mallet-20/TMmodel"
+
+# The timer starts before loading, so the reported time covers load, product
+# and save.
+t_start = time.perf_counter()
+TMfolder = pathlib.Path(dir)
+thetas = sparse.load_npz(TMfolder.joinpath('thetas.npz'))
+print(f"Shape of thetas: {np.shape(thetas)} ")
+# Square root of the loaded matrix, and its transpose, are the two operands of
+# the product below.
+# NOTE(review): presumably thetas is a documents x topics matrix, which would
+# make the product document-by-document -- confirm.
+thetas_sqrt = np.sqrt(thetas)
+thetas_col = thetas_sqrt.T
+#topn=np.shape(thetas)[0]
+#topn=numero_filas
+#print(f"Topn: {topn}")
+# English summary of the (Spanish) note kept in the string literal below:
+# "Question: if I set topn equal to the dataset size I get the following error
+# [OverflowError: value too large to convert to int]. However, I ran it with
+# topn=40000 and the shape of sims is dataset size x dataset size.
+# Execution time: 28 min".
+"""Duda: si le pongo topn igual al tamaño del dataset me da el siguiente error:
+Traceback (most recent call last):
+  File "/export/usuarios_ml4ds/amendoza/top-comp/automatic_scripts/similarities.py", line 42, in <module>
+    sims = awesome_cossim_topn(thetas_sqrt, thetas_col, topn, lb)
+  File "/export/usuarios_ml4ds/amendoza/top-comp/.venv/lib/python3.10/site-packages/sparse_dot_topn/awesome_cossim_topn.py", line 102, in awesome_cossim_topn
+    alt_indices, alt_data = ct.sparse_dot_topn_extd(
+  File "sparse_dot_topn/sparse_dot_topn.pyx", line 149, in sparse_dot_topn.sparse_dot_topn.__pyx_fuse_0sparse_dot_topn_extd
+  File "sparse_dot_topn/sparse_dot_topn.pyx", line 219, in sparse_dot_topn.sparse_dot_topn.sparse_dot_topn_extd
+OverflowError: value too large to convert to int
+
+Sin embargo, lo he ejecutado con topn=40000 y el shape de sims es tamaño del dataset x tamaño del dataset
+Tiempo de ejecución: 28 min"""
+# Arguments passed to awesome_cossim_topn below.
+# NOTE(review): presumably `topn` caps the entries kept per row and `lb` is the
+# lower bound a value must exceed to be kept -- confirm against the
+# sparse_dot_topn documentation.
+topn = 300
+print(f"Topn: {topn}")
+lb=0
+sims = awesome_cossim_topn(thetas_sqrt, thetas_col, topn, lb)
+# NOTE(review): the file is named 'distances.npz' although it stores the
+# product computed above -- confirm the intended naming.
+sparse.save_npz(TMfolder.joinpath('distances.npz'), sims)
+
+t_end = time.perf_counter()
+# perf_counter() differences are in seconds; dividing by 60 reports minutes.
+t_total = (t_end - t_start)/60
+print(f"Total computation time: {t_total}")
+"""
+TMfolder_sims = TMfolder.joinpath('distances.npz')
+sims = sparse.load_npz(TMfolder_sims).toarray()
+size = np.size(sims)
+num_nonzeros = np.count_nonzero(sims)
+percentage = ((size-num_nonzeros)/size)*100
+print(f"Percentage of zeros is: {percentage}")
+
+matriz_sin_ceros = np.ma.masked_where(sims == 0, sims)
+
+# Encuentra el valor mínimo y máximo de los elementos que no son cero
+valor_minimo = np.min(matriz_sin_ceros)
+valor_maximo = np.max(matriz_sin_ceros)
+
+print("Valor mínimo de los elementos que no son cero:", valor_minimo)
+print("Valor máximo de los elementos que no son cero:", valor_maximo)
+
+sims_list = []
+print(f"Shape of sims: {np.shape(sims)} ")
+
+for i in range(0, len(sims)):
+    sims_list.append(np.count_nonzero(sims[i]))
+
+print(f"Length of sims_list: {len(sims_list)}")
+maximo_elemento = max(sims_list)
+
+print(f"Num of non zero elements in the document with more similarities computed {maximo_elemento}")
+
+print(f"Non zero elements in one row of sims: {np.count_nonzero(sims[1])}")
+print(f"Size of one row of sims: {np.size(sims[1])}")
+"""
diff --git a/aux/sims_representation.py b/aux/sims_representation.py
@@ -0,0 +1,66 @@
+import encodings
+import pathlib
+import scipy.sparse as sparse
+from typing import List
+import time
+import tqdm
+
+def get_doc_by_doc_sims(W, ids_corpus) -> List[str]:
+    """
+    Builds, for every document, a string listing the documents it is similar to.
+
+    Each row of ``W`` becomes a space-separated sequence of
+    ``<doc_id>|<similarity>`` tokens, one token per non-zero entry of the row,
+    in the order the entries are stored. The first non-zero entry of every row
+    is skipped (presumably the document's similarity with itself -- TODO
+    confirm against how ``W`` is produced).
+
+    Parameters
+    ----------
+    W: scipy.sparse.csr_matrix
+        Sparse matrix with the similarities between each pair of documents in the corpus collection.
+    ids_corpus: List[str]
+        List of ids of the documents in the corpus collection, indexed by the
+        column position in ``W``.
+
+    Returns:
+    --------
+    sims: List[str]
+        List of string representation of the top similarities between each pair of documents in the corpus collection.
+        It has one element per row of ``W``; a row with fewer than two
+        non-zero entries yields an empty string.
+    """
+
+    # Walk the CSR arrays once instead of building a boolean mask over all the
+    # non-zero entries for every row and looking each value up with
+    # W[row, col]: the cost is now linear in the number of stored entries.
+    W = W.tocsr()
+    indptr, indices, data = W.indptr, W.indices, W.data
+
+    sim_str = []
+    for row in range(W.shape[0]):
+        start, end = indptr[row], indptr[row + 1]
+        # Explicitly stored zeros are not similarities; leave them out.
+        entries = [(col, val)
+                   for col, val in zip(indices[start:end], data[start:end])
+                   if val != 0]
+        # Skip the first non-zero entry of the row, then format the rest.
+        sim_str.append(' '.join(
+            f"{ids_corpus[col]}|{val}" for col, val in entries[1:]))
+
+    return sim_str
+
+
+
+# Hard-coded folder that holds corpus.txt and the TMmodel subfolder.
+dir = pathlib.Path("/export/usuarios_ml4ds/amendoza/Intelcomp/EWB/data/source/Mallet-20/")
+
+# Load the sparse matrix stored in TMmodel/distances.npz.
+sims = sparse.load_npz(dir / "TMmodel" / "distances.npz")
+print("Sims obtained")
+
+def process_line(line):
+    """
+    Extracts the integer document id from one line of the corpus file.
+
+    The id is the text before the first ``' 0 '`` separator (or the whole
+    line when the separator is absent), with surrounding whitespace and
+    double quotes removed.
+
+    Parameters
+    ----------
+    line: str
+        One line of the corpus file.
+
+    Returns:
+    --------
+    id_: int
+        Document id found at the start of the line.
+    """
+    raw_id, _, _ = line.partition(' 0 ')
+    return int(raw_id.strip().strip('"'))
+
+# Read one document id per line of the corpus file.
+with open(dir / "corpus.txt", encoding="utf-8") as corpus_file:
+    ids_corpus = list(map(process_line, corpus_file))
+print("Ids obtained")
+print("Starting similarities representation...")
+# Only the string-building step is timed.
+time_start = time.perf_counter()
+sim_rpr = get_doc_by_doc_sims(sims, ids_corpus)
+elapsed = time.perf_counter() - time_start
+print(f"Similarities representation finished in {elapsed:0.4f} seconds")
+print("Writing similarities representation to txt file...")
+
+# Write to the file, one line per document.
+with open(dir / "TMmodel" / 'distances.txt', 'w') as out_file:
+    out_file.writelines(f"{item}\n" for item in sim_rpr)
+
+"""
+# Leer el archivo
+with open('distances.txt', 'r') as f:
+    mi_lista = [line.strip() for line in f]
+"""
diff --git a/restapi/src/core/clients/base/solr_client.py b/restapi/src/core/clients/base/solr_client.py
@@ -207,7 +207,7 @@ def __init__(self, logger: logging.Logger) -> None:
     def _do_request(self,
                     type: str,
                     url: str,
-                    timeout: int = 10,
+                    timeout: int = None,
                     **params) -> SolrResp:
         """Sends a requests to the given url with the given params and returns an object of the SolrResp class
 

diff --git a/restapi/src/core/clients/ewb_solr_client.py b/restapi/src/core/clients/ewb_solr_client.py
@@ -305,6 +305,10 @@ def index_model(self, model_path: str) -> None:
         else:
             self.logger.info(
                 f"-- -- Collection {model_name} successfully created.")
+
+        metadata = self.do_Q2("cordis")
+        self.logger.info(
+            f"-- -- Metadata of {self.corpus_col} before creating the model: {metadata}")
 
         # 3. Create Model object and extract info from the corpus to index
         model = Model(model_to_index)
@@ -318,13 +322,21 @@ def index_model(self, model_path: str) -> None:
             return
         field_update = model.get_corpora_model_update(
             id=results.docs[0]["id"], action='add')
+
+        metadata = self.do_Q2("cordis")
+        self.logger.info(
+            f"-- -- Metadata of {self.corpus_col} before adding the doc-tpc distribution: {metadata}")
 
         # 4. Add field for the doc-tpc distribution associated with the model being indexed in the document associated with the corpus
         self.logger.info(
             f"-- -- Indexing model information of {model_name} in {self.corpus_col} starts.")
         self.index_documents(field_update, self.corpus_col, self.batch_size)
         self.logger.info(
             f"-- -- Indexing of model information of {model_name} info in {self.corpus_col} completed.")
+
+        metadata = self.do_Q2("cordis")
+        self.logger.info(
+            f"-- -- Metadata of {self.corpus_col} before modifying the schema: {metadata}")  
 
         # 5. Modify schema in corpus collection to add field for the doc-tpc distribution and the similarities associated with the model being indexed
         model_key = 'doctpc_' + model_name
@@ -337,6 +349,11 @@ def index_model(self, model_path: str) -> None:
             f"-- -- Adding field {sim_model_key} in {corpus_name} collection")
         _, err = self.add_field_to_schema(
             col_name=corpus_name, field_name=sim_model_key, field_type='VectorFloatField')
+
+
+        metadata = self.do_Q2("cordis")
+        self.logger.info(
+            f"-- -- Metadata of {self.corpus_col} before indexing doc-tpc information: {metadata}")  
 
         # 6. Index doc-tpc information in corpus collection
         self.logger.info(
@@ -610,9 +627,10 @@ def do_Q1(self,
             resp = {'thetas': -1}
 
         return resp, sc
-
+    
     def do_Q2(self, corpus_col: str) -> Union[dict, int]:
-        """Executes query Q2.
+        """
+        Executes query Q2.
 
         Parameters
         ----------
@@ -646,11 +664,13 @@ def do_Q2(self, corpus_col: str) -> Union[dict, int]:
             return
 
         # Filter out metadata fields that we don't consider metadata
+        #meta_fields = [field for field in results.docs[0]
+                       #['fields'] if field not in self.no_meta_fields and not field.startswith("doctpc_")]
         meta_fields = [field for field in results.docs[0]
-                       ['fields'] if field not in self.no_meta_fields and not field.startswith("doctpc_")]
-
+                       ['fields'] if field not in self.no_meta_fields]
+        
         return {'metadata_fields': meta_fields}, sc
-
+    
     def do_Q3(self, col: str) -> Union[dict, int]:
         """Executes query Q3.
 

diff --git a/restapi/src/core/entities/model.py b/restapi/src/core/entities/model.py
@@ -273,12 +273,17 @@ def get_doc_by_doc_sims(W, ids_corpus) -> List[str]:
 
                 return sim_str
 
-            sim_rpr = get_doc_by_doc_sims(self.sims, ids_corpus)
+            #sim_rpr = get_doc_by_doc_sims(self.sims, ids_corpus)
+
+            with open(self.path_to_model.joinpath("TMmodel").joinpath('distances.txt'), 'r') as f:
+                sim_rpr = [line.strip() for line in f]
             self._logger.info(
                 "Thetas and sims attained. Creating dataframe...")
             # Save the information in a dataframe
             df = pd.DataFrame(list(zip(ids_corpus, doc_tpc_rpr, sim_rpr)),
                               columns=['id', model_key, sim_model_key])
+            self._logger.info(
+            f"Dataframe created. Printing it:{df.columns.tolist()}")
             # self._logger.info("Merging dataframes...")
             # df = pd.merge(df, df_orig_ids, on=['id'], how='outer').fillna("")