-
Notifications
You must be signed in to change notification settings - Fork 3
/
run_workflow.py
58 lines (46 loc) · 3.14 KB
/
run_workflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import argparse
import json

from OCMeta_coverage_classes import *
from Disciplines_Countries_classes import *
from utils import load_data
def main(args):
    """Run the full SSH-coverage workflow.

    Steps: process the OpenCitations Meta dump, extract per-country and
    per-discipline counts (each saved as CSV + JSON), then compare US vs EU
    coverage.

    Args:
        args: argparse.Namespace with batch_size, max_workers, oc_meta,
            erih_plus, doaj and remove_megajournals attributes.
    """
    # TO RUN ALL WORKFLOW
    # args are: batch_size, max_workers, OCmeta_path, erih_path, doaj_path
    meta_coverage = PlayaristsProcessor(args.batch_size, args.max_workers, args.oc_meta, args.erih_plus, args.doaj)
    print("##### Starting OC Meta processing:")
    meta_coverage.process_files()

    print("##### Extracting Countries:")
    countries = CountriesProcessor(meta_coverage, args.remove_megajournals)
    # create_countries_dict() returns (countries_dict, venues_without_country)
    countries_dict, no_country_venues = countries.create_countries_dict()
    result_countries = CountsProcessor(meta_coverage, "SSH_Publications_and_Journals_by_Country.csv", args.remove_megajournals)
    result_countries = result_countries.counts(countries_dict, "Country")
    # save dictionary to json; `with` guarantees the file is closed even on error
    with open("result_countries.json", "w") as save_countries_dict:
        json.dump(countries_dict, save_countries_dict, indent=6)
    print("##### These venues have no country specified", no_country_venues)

    print("##### Extracting Disciplines:")
    disciplines = DisciplinesProcessor(meta_coverage, args.remove_megajournals)
    disciplines_dict = disciplines.create_disciplines_dict()
    result_disciplines = CountsProcessor(meta_coverage, "SSH_Publications_by_Discipline.csv", args.remove_megajournals)
    result_disciplines = result_disciplines.counts(disciplines_dict, "Discipline")
    # save dictionary to json
    with open("result_disciplines.json", "w") as save_disciplines_dict:
        json.dump(disciplines_dict, save_disciplines_dict, indent=6)

    print("##### Comparing EU and US:")
    us_eu = Compare_US_EU(meta_coverage, args.remove_megajournals)
    # compare_us_eu returns (df_us, df_eu, us_data, eu_data)
    meta_coverage_us, meta_coverage_eu, us_data, eu_data = us_eu.compare_us_eu(args.erih_plus, countries_dict)
    us_eu.counts_us_eu(disciplines_dict, "Discipline", meta_coverage_us, "compareUS_EU/us_disciplines_count.csv")
    us_eu.counts_us_eu(disciplines_dict, "Discipline", meta_coverage_eu, "compareUS_EU/eu_disciplines_count.csv")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=150, type=int, required=False, help="batch size: e.g. 100")
parser.add_argument("--max_workers", default=4, type=int, required=False, help="max_workers: e.g. 4")
parser.add_argument("--oc_meta", default="csv_dump", type=str, required=False, help="path to the OpenCitations Meta dataset")
parser.add_argument("--erih_plus", default="ERIHPLUSapprovedJournals.csv", type=str, required=False, help="path to the ERIH PLUS dataset")
parser.add_argument("--doaj", default="journalcsv__doaj_20230528_0035_utf8.csv", type=str, required=False, help="path to the DOAJ file")
parser.add_argument("--remove_megajournals", default=True, type=bool, required=False, help="exclude mega journals from analysis")
args = parser.parse_args()
main(args)