Skip to content

Commit

Permalink
pending changes in datamanager and taskmanager
Browse files Browse the repository at this point in the history
  • Loading branch information
Orieus committed Dec 12, 2024
1 parent aac8b1e commit a0bb9af
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 5 deletions.
8 changes: 4 additions & 4 deletions config/parameters.default.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Paths to the project folder and to the dataset folder.
# Used for the IMT only. In general, the project folder and the dataset
# can be specified as args of the command running the application.
project_folder_path: /data/DCmodels
dataset_path: /data/datasets
# project_folder_path: /data/DCmodels
# dataset_path: /data/datasets
# For debug mode, the following can be used:
# project_folder_path: ../projects/debug_imt
# dataset_path: ../datasets
project_folder_path: ../projects
dataset_path: ../logical_datasets

# Parameters for the corpus loaders
corpus:
Expand Down
26 changes: 25 additions & 1 deletion src/data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,29 @@ def sample_sub_set_from_folder(path2texts, selected_cols,
logging.info(f'-- -- Raw corpus {corpus_name} read with '
f'{len(df_corpus)} documents')

elif corpus_name == 'SS_202403':

# Load data from parquet files
path2texts = self.path2corpus / 'corpus'
# Original fields are:
# 'actID', 'Order', 'ActivityType', 'Title', 'Keywords',
# 'ResearchAreas', 'DOI', 'Year', 'Publisher', 'ISSN', 'EISSN',
# 'ISBN', 'referencecount', 'citationcount',
# 'influentialcitationcount', 'paperAbstract'
columns = ['actID', 'Title', 'paperAbstract', 'Keywords']
# Map column names to normalized names
mapping = {'actID': 'id',
'Title': 'title',
'paperAbstract': 'description',
'Keywords': 'keywords'}

breakpoint()

df_corpus = self._load_parquet(path2texts, columns, mapping)

logging.info(f'-- -- Raw corpus {corpus_name} read with '
f'{len(df_corpus)} documents')

elif corpus_name == 'cordis_evoc_vs_all':

# Original fields are:
Expand Down Expand Up @@ -1495,7 +1518,8 @@ def __get_corpus_from_logical_dataset(self, sampling_factor, corpus_name):
with ProgressBar():
df_corpus = dfsmall.compute()

if 'categoryfld' not in dataset or dataset['categoryfld'] == "":
if ('categoryfld' not in dataset or dataset['categoryfld'] == ""
or dataset['categoryfld'] is None):
selected_cols = np.array([dataset['idfld'],
dataset['titlefld'],
dataset['textfld']])
Expand Down
3 changes: 3 additions & 0 deletions src/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,6 +882,8 @@ def train_PUmodel(self, max_imbalance: float = 3.0, nmax: int = 400,
Number of training epochs
"""

breakpoint()

if self.df_dataset is None:
logging.warning("-- No model is loaded. "
"You must load or create a set of labels first")
Expand Down Expand Up @@ -2022,6 +2024,7 @@ def on_create_list_of_keywords(
"""

self.setup()
breakpoint()
self.load_corpus(corpus_name)
if keyword_list == '__all_AI':
self.keywords = (
Expand Down

0 comments on commit a0bb9af

Please sign in to comment.