pending changes in datamanager and taskmanager

IntelCompH2020 · Dec 12, 2024 · a0bb9af · a0bb9af
1 parent aac8b1e
commit a0bb9af
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 5 deletions.
diff --git a/config/parameters.default.yaml b/config/parameters.default.yaml
@@ -1,11 +1,11 @@
 # Parameters for the project folder and dataset paths.
 # Used for the IMT only. In general, the project folder and the dataset
 # can be specified as args of the command running the application
-project_folder_path: /data/DCmodels     
-dataset_path: /data/datasets            
+# project_folder_path: /data/DCmodels     
+# dataset_path: /data/datasets            
 # For debug mode, the following can be used:
-# project_folder_path: ../projects/debug_imt
-# dataset_path: ../datasets
+project_folder_path: ../projects
+dataset_path: ../logical_datasets
 
 # Parameters for the corpus loaders
 corpus:

diff --git a/src/data_manager.py b/src/data_manager.py
@@ -535,6 +535,29 @@ def sample_sub_set_from_folder(path2texts, selected_cols,
             logging.info(f'-- -- Raw corpus {corpus_name} read with '
                          f'{len(df_corpus)} documents')
 
+        elif corpus_name == 'SS_202403':
+
+            # Load data from parquet files
+            path2texts = self.path2corpus / 'corpus'
+            # Original fields are:
+            #  'actID', 'Order', 'ActivityType', 'Title', 'Keywords',
+            #  'ResearchAreas', 'DOI', 'Year', 'Publisher', 'ISSN', 'EISSN',
+            #  'ISBN', 'referencecount', 'citationcount',
+            #  'influentialcitationcount', 'paperAbstract'
+            columns = ['actID', 'Title', 'paperAbstract', 'Keywords']
+            # Map column names to normalized names
+            mapping = {'actID': 'id',
+                       'Title': 'title',
+                       'paperAbstract': 'description',
+                       'Keywords': 'keywords'}
+
+            breakpoint()
+
+            df_corpus = self._load_parquet(path2texts, columns, mapping)
+
+            logging.info(f'-- -- Raw corpus {corpus_name} read with '
+                         f'{len(df_corpus)} documents')
+
         elif corpus_name == 'cordis_evoc_vs_all':
 
             # Original fields are:
@@ -1495,7 +1518,8 @@ def __get_corpus_from_logical_dataset(self, sampling_factor, corpus_name):
             with ProgressBar():
                 df_corpus = dfsmall.compute()
 
-            if 'categoryfld' not in dataset or dataset['categoryfld'] == "":
+            if ('categoryfld' not in dataset or dataset['categoryfld'] == ""
+                    or dataset['categoryfld'] is None):
                 selected_cols = np.array([dataset['idfld'],
                                           dataset['titlefld'],
                                           dataset['textfld']])

diff --git a/src/task_manager.py b/src/task_manager.py
@@ -882,6 +882,8 @@ def train_PUmodel(self, max_imbalance: float = 3.0, nmax: int = 400,
             Number of training epochs
         """
 
+        breakpoint()
+
         if self.df_dataset is None:
             logging.warning("-- No model is loaded. "
                             "You must load or create a set of labels first")
@@ -2022,6 +2024,7 @@ def on_create_list_of_keywords(
         """
 
         self.setup()
+        breakpoint()
         self.load_corpus(corpus_name)
         if keyword_list == '__all_AI':
             self.keywords = (