diff --git a/src/data_manager.py b/src/data_manager.py index 74aeac1..f7908b7 100644 --- a/src/data_manager.py +++ b/src/data_manager.py @@ -386,7 +386,8 @@ def sample_sub_set_from_folder(path2texts, selected_cols, # By default, neither corpus cleaning nor language filtering are done clean_corpus = corpus_name in { 'SemanticScholar', 'SemanticScholar_emb', 'patstat', 'patstat_emb', - 'AEI_projects', 'CORDIS.parquet', 'S2CS.parquet'} + 'AEI_projects', 'CORDIS.parquet', 'S2CS.parquet', + 'cordis_evoc_vs_all', 'patstat_intersection_vs_all'} remove_non_en = corpus_name in { 'SemanticScholar', 'SemanticScholar_emb', 'patstat', 'patstat_emb'}