Commit
aux folder created and slight modifications to enable similarities calculation and indexation
Alberto Mendoza García committed Sep 7, 2023
1 parent ee61a93 commit 6e144f2
Showing 5 changed files with 169 additions and 7 deletions.
71 changes: 71 additions & 0 deletions aux/similarities.py
@@ -0,0 +1,71 @@
import numpy as np
import scipy.sparse as sparse
from sparse_dot_topn import awesome_cossim_topn
import time
import pathlib


model_dir = "/export/usuarios_ml4ds/amendoza/Intelcomp/EWB/data/source/Mallet-20/TMmodel"

t_start = time.perf_counter()
TMfolder = pathlib.Path(model_dir)
thetas = sparse.load_npz(TMfolder.joinpath('thetas.npz'))
print(f"Shape of thetas: {np.shape(thetas)} ")
thetas_sqrt = np.sqrt(thetas)
thetas_col = thetas_sqrt.T
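# Assumption behind the square root: if each row of thetas is a doc-topic
# probability distribution, the dot product of two sqrt-transformed rows is the
# Bhattacharyya coefficient between the two distributions, a natural
# similarity measure for comparing documents by their topic proportions.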
#topn=np.shape(thetas)[0]
#topn=numero_filas
#print(f"Topn: {topn}")
"""Duda: si le pongo topn igual al tamaño del dataset me da el siguiente error:
Traceback (most recent call last):
File "/export/usuarios_ml4ds/amendoza/top-comp/automatic_scripts/similarities.py", line 42, in <module>
sims = awesome_cossim_topn(thetas_sqrt, thetas_col, topn, lb)
File "/export/usuarios_ml4ds/amendoza/top-comp/.venv/lib/python3.10/site-packages/sparse_dot_topn/awesome_cossim_topn.py", line 102, in awesome_cossim_topn
alt_indices, alt_data = ct.sparse_dot_topn_extd(
File "sparse_dot_topn/sparse_dot_topn.pyx", line 149, in sparse_dot_topn.sparse_dot_topn.__pyx_fuse_0sparse_dot_topn_extd
File "sparse_dot_topn/sparse_dot_topn.pyx", line 219, in sparse_dot_topn.sparse_dot_topn.sparse_dot_topn_extd
OverflowError: value too large to convert to int
Sin embargo, lo he ejecutado con topn=40000 y el shape de sims es tamaño del dataset x tamaño del dataset
Tiempo de ejecución: 28 min"""
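# Likely cause (untested assumption): with topn on the order of the corpus
# size, the result can hold up to n_rows * topn stored entries, which can
# overflow the int32 index type that sparse_dot_topn uses internally.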
topn = 300
print(f"Topn: {topn}")
lb = 0
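# awesome_cossim_topn(A, B, ntop, lower_bound) keeps, for each row of A @ B,
# only the ntop largest values that are >= lower_bound; lb = 0 keeps the full top-n.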
sims = awesome_cossim_topn(thetas_sqrt, thetas_col, topn, lb)
sparse.save_npz(TMfolder.joinpath('distances.npz'), sims)
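# Note: despite its name, distances.npz stores similarity scores (higher means
# more similar), as returned by awesome_cossim_topn.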

t_end = time.perf_counter()
t_total = (t_end - t_start)/60
print(f"Total computation time: {t_total:.2f} minutes")
"""
TMfolder_sims = TMfolder.joinpath('distances.npz')
sims = sparse.load_npz(TMfolder_sims).toarray()
size = np.size(sims)
num_nonzeros = np.count_nonzero(sims)
percentage = ((size-num_nonzeros)/size)*100
print(f"Percentage of zeros is: {percentage}")
matriz_sin_ceros = np.ma.masked_where(sims == 0, sims)
# Find the minimum and maximum among the non-zero elements
valor_minimo = np.min(matriz_sin_ceros)
valor_maximo = np.max(matriz_sin_ceros)
print("Minimum of the non-zero elements:", valor_minimo)
print("Maximum of the non-zero elements:", valor_maximo)
sims_list = []
print(f"Shape of sims: {np.shape(sims)} ")
for i in range(0, len(sims)):
sims_list.append(np.count_nonzero(sims[i]))
print(f"Length of sims_list: {len(sims_list)}")
maximo_elemento = max(sims_list)
print(f"Num of non zero elements in the document with more similarities computed {maximo_elemento}")
print(f"Non zero elements in one row of sims: {np.count_nonzero(sims[1])}")
print(f"Size of one row of sims: {np.size(sims[1])}")
"""
66 changes: 66 additions & 0 deletions aux/sims_representation.py
@@ -0,0 +1,66 @@
import pathlib
import scipy.sparse as sparse
from typing import List
import time

def get_doc_by_doc_sims(W, ids_corpus) -> List[str]:
    """
    Builds the string representation of the similarities between each pair of documents in the corpus collection, given the similarity matrix computed from the document-topic distributions of the model being indexed.

    Parameters
    ----------
    W: scipy.sparse.csr_matrix
        Sparse matrix with the similarities between each pair of documents in the corpus collection.
    ids_corpus: List[str]
        List of ids of the documents in the corpus collection.

    Returns
    -------
    sims: List[str]
        List of string representations of the top similarities between each pair of documents in the corpus collection.
    """

    # Get the indices of the non-zero elements
    non_zero_indices = W.nonzero()

    # Convert each row to its string representation
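    # Each row is serialized as "docID|score docID|score ..."; the [1:] slice
    # drops the first entry, presumably the document's similarity with itself.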
    sim_str = \
        [' '.join([f"{ids_corpus[col]}|{W[row, col]}" for col in non_zero_indices[1]
                   [non_zero_indices[0] == row]][1:]) for row in range(W.shape[0])]

    return sim_str



model_dir = pathlib.Path("/export/usuarios_ml4ds/amendoza/Intelcomp/EWB/data/source/Mallet-20/")

sims = sparse.load_npz(model_dir.joinpath("TMmodel").joinpath("distances.npz"))
print("Sims obtained")

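# Assumption: corpus.txt lines follow the Mallet import format, e.g.
# '"<docID>" 0 <tokenized text>', so the id is everything before the last ' 0 '.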
def process_line(line):
    id_ = line.rsplit(' 0 ')[0].strip()
    id_ = int(id_.strip('"'))
    return id_

with open(model_dir.joinpath("corpus.txt"), encoding="utf-8") as file:
    ids_corpus = [process_line(line) for line in file]
print("Ids obtained")
print("Starting similarities representation...")
time_start = time.perf_counter()
sim_rpr = get_doc_by_doc_sims(sims, ids_corpus)
time_end = time.perf_counter()
print(f"Similarities representation finished in {time_end - time_start:0.4f} seconds")
print(f"Writing similarities representation to txt file...")

# Write the similarity representation to the file
with open(model_dir.joinpath("TMmodel").joinpath('distances.txt'), 'w') as f:
    for item in sim_rpr:
        f.write("%s\n" % item)

"""
# Leer el archivo
with open('distances.txt', 'r') as f:
mi_lista = [line.strip() for line in f]
"""
2 changes: 1 addition & 1 deletion restapi/src/core/clients/base/solr_client.py
@@ -207,7 +207,7 @@ def __init__(self, logger: logging.Logger) -> None:
def _do_request(self,
type: str,
url: str,
timeout: int = 10,
timeout: int = None,
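# NOTE (assumption): if this value is forwarded to requests, timeout=None
# disables the timeout entirely, so a call may block indefinitely.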
**params) -> SolrResp:
"""Sends a requests to the given url with the given params and returns an object of the SolrResp class
30 changes: 25 additions & 5 deletions restapi/src/core/clients/ewb_solr_client.py
@@ -305,6 +305,10 @@ def index_model(self, model_path: str) -> None:
else:
self.logger.info(
f"-- -- Collection {model_name} successfully created.")

metadata = self.do_Q2("cordis")
self.logger.info(
f"-- -- Metadata of {self.corpus_col} before creating the model: {metadata}")

# 3. Create Model object and extract info from the corpus to index
model = Model(model_to_index)
@@ -318,13 +322,21 @@ def index_model(self, model_path: str) -> None:
return
field_update = model.get_corpora_model_update(
id=results.docs[0]["id"], action='add')

metadata = self.do_Q2("cordis")
self.logger.info(
f"-- -- Metadata of {self.corpus_col} before adding the doc-tpc distribution: {metadata}")

# 4. Add field for the doc-tpc distribution associated with the model being indexed in the document associated with the corpus
self.logger.info(
f"-- -- Indexing model information of {model_name} in {self.corpus_col} starts.")
self.index_documents(field_update, self.corpus_col, self.batch_size)
self.logger.info(
f"-- -- Indexing of model information of {model_name} info in {self.corpus_col} completed.")

metadata = self.do_Q2("cordis")
self.logger.info(
f"-- -- Metadata of {self.corpus_col} before modifying the schema: {metadata}")

# 5. Modify schema in corpus collection to add field for the doc-tpc distribution and the similarities associated with the model being indexed
model_key = 'doctpc_' + model_name
@@ -337,6 +349,11 @@ def index_model(self, model_path: str) -> None:
f"-- -- Adding field {sim_model_key} in {corpus_name} collection")
_, err = self.add_field_to_schema(
col_name=corpus_name, field_name=sim_model_key, field_type='VectorFloatField')


metadata = self.do_Q2("cordis")
self.logger.info(
f"-- -- Metadata of {self.corpus_col} before indexing doc-tpc information: {metadata}")

# 6. Index doc-tpc information in corpus collection
self.logger.info(
@@ -610,9 +627,10 @@ def do_Q1(self,
resp = {'thetas': -1}

return resp, sc

def do_Q2(self, corpus_col: str) -> Union[dict, int]:
"""Executes query Q2.
"""
Executes query Q2.
Parameters
----------
Expand Down Expand Up @@ -646,11 +664,13 @@ def do_Q2(self, corpus_col: str) -> Union[dict, int]:
return

# Filter out metadata fields that we don't consider metadata
#meta_fields = [field for field in results.docs[0]
#['fields'] if field not in self.no_meta_fields and not field.startswith("doctpc_")]
meta_fields = [field for field in results.docs[0]
['fields'] if field not in self.no_meta_fields and not field.startswith("doctpc_")]

['fields'] if field not in self.no_meta_fields]
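# Note: with the doctpc_ filter commented out above, Q2 now also reports
# doc-topic distribution fields (doctpc_*) among the metadata fields.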
return {'metadata_fields': meta_fields}, sc

def do_Q3(self, col: str) -> Union[dict, int]:
"""Executes query Q3.
7 changes: 6 additions & 1 deletion restapi/src/core/entities/model.py
@@ -273,12 +273,17 @@ def get_doc_by_doc_sims(W, ids_corpus) -> List[str]:

return sim_str

sim_rpr = get_doc_by_doc_sims(self.sims, ids_corpus)
#sim_rpr = get_doc_by_doc_sims(self.sims, ids_corpus)
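# Instead of recomputing the similarity representation here, read it from
# disk; distances.txt is presumably the file written by aux/sims_representation.py.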

with open(self.path_to_model.joinpath("TMmodel").joinpath('distances.txt'), 'r') as f:
sim_rpr = [line.strip() for line in f]
self._logger.info(
"Thetas and sims attained. Creating dataframe...")
# Save the information in a dataframe
df = pd.DataFrame(list(zip(ids_corpus, doc_tpc_rpr, sim_rpr)),
columns=['id', model_key, sim_model_key])
self._logger.info(
f"Dataframe created. Columns: {df.columns.tolist()}")
# self._logger.info("Merging dataframes...")
# df = pd.merge(df, df_orig_ids, on=['id'], how='outer').fillna("")

