import pandas as pd

from retrieve_doaj_country import retrieve_doaj_country
from utils import load_data, save_to_results

# ============== PROCESSORS FOR RETRIEVING DISCIPLINES AND COUNTRIES =============== #
class ResultsProcessor(object):
    def __init__(self, meta_coverage, remove_megajournals, meta_coverage_processed_files):
        # meta_coverage is an object exposing get_erih_df() and get_doaj_df()
        # (e.g. a PlayaristProcessor instance)
        self.meta_coverage = meta_coverage
        meta_df = pd.read_csv(meta_coverage_processed_files)
        # the first four rows of the processed file hold the megajournals,
        # so they are dropped when remove_megajournals is set
        self.meta_df = meta_df.iloc[4:] if remove_megajournals else meta_df
        self.erih_df = meta_coverage.get_erih_df()
        self.doaj_df = meta_coverage.get_doaj_df()
        self.remove_megajournals = remove_megajournals
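
# A minimal sketch of the interface ResultsProcessor expects from its
# meta_coverage argument: only get_erih_df() and get_doaj_df() are used in
# this module. The stub below is a hypothetical stand-in (not part of the
# original pipeline), useful e.g. for exercising the processors on toy frames.
class _StubMetaCoverage:
    def __init__(self, erih_df, doaj_df):
        self._erih_df = erih_df
        self._doaj_df = doaj_df

    def get_erih_df(self):
        return self._erih_df

    def get_doaj_df(self):
        return self._doaj_df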
class CountriesProcessor(ResultsProcessor):
    def __init__(self, meta_coverage, remove_megajournals=False,
                 meta_coverage_processed_files="results/SSH_Publications_in_OC_Meta_and_Open_Access_status.csv"):
        super().__init__(meta_coverage, remove_megajournals, meta_coverage_processed_files)
        self.doaj_df = self.doaj_df[["Journal ISSN (print version)",
                                     "Journal EISSN (online version)",
                                     "Country of publisher"]]
        self.unmatched_countries = []

    def create_countries_dict(self):
        # invert the ERIH PLUS data into {country: [journal IDs]}; journals
        # without a country are collected and resolved through DOAJ afterwards
        countr_dict = {}
        merged_df = pd.merge(self.erih_df, self.meta_df, left_on='Journal ID', right_on='EP_id')
        for idx, row in merged_df.iterrows():
            if pd.isna(row["Country of Publication"]):
                self.unmatched_countries.append(row["Journal ID"])
            else:
                for country in set(row["Country of Publication"].split(', ')):
                    countr_dict.setdefault(country, []).append(row["Journal ID"])
        complete_country_dict = retrieve_doaj_country(
            self.unmatched_countries, merged_df, self.doaj_df, countr_dict)
        return complete_country_dict  # a tuple (countr_dict, unmatched_df)
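
# The inversion above, shown on toy data: each journal ID is appended to the
# list of every country in its "Country of Publication" string. The IDs and
# countries below are made up for illustration only.
def _demo_invert_countries():
    toy = pd.DataFrame({
        "Journal ID": ["id1", "id2"],
        "Country of Publication": ["Italy", "Italy, Norway"],
    })
    inverted = {}
    for _, row in toy.iterrows():
        for country in set(row["Country of Publication"].split(', ')):
            inverted.setdefault(country, []).append(row["Journal ID"])
    return inverted  # {'Italy': ['id1', 'id2'], 'Norway': ['id2']}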
class DisciplinesProcessor(ResultsProcessor):
    def __init__(self, meta_coverage, remove_megajournals=False,
                 meta_coverage_processed_files="results/SSH_Publications_in_OC_Meta_and_Open_Access_status.csv"):
        super().__init__(meta_coverage, remove_megajournals, meta_coverage_processed_files)

    def create_disciplines_dict(self):
        # invert the ERIH PLUS data into {discipline: [journal IDs]}
        disc_dict = {}
        merged_df = pd.merge(self.erih_df, self.meta_df, left_on='Journal ID', right_on='EP_id')
        for idx, row in merged_df.iterrows():
            for discipline in set(row["ERIH PLUS Disciplines"].split(', ')):
                disc_dict.setdefault(discipline, []).append(row["Journal ID"])
        return disc_dict
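
# Hypothetical usage sketch (names are assumptions for illustration only):
# both processors share one meta_coverage object, i.e. anything matching the
# _StubMetaCoverage interface above. Per the comment on create_countries_dict,
# its return value is assumed to be a (countr_dict, unmatched_df) tuple.
def _demo_build_dicts(meta_coverage):
    disciplines = DisciplinesProcessor(meta_coverage).create_disciplines_dict()
    countries, unmatched = CountriesProcessor(meta_coverage).create_countries_dict()
    return disciplines, countries, unmatched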
# ================= COUNTS AND CSV EXPORT ====================== #
class CountsProcessor(ResultsProcessor):
    def __init__(self, meta_coverage, export_path, remove_megajournals=False,
                 meta_coverage_processed_files="results/SSH_Publications_in_OC_Meta_and_Open_Access_status.csv"):
        self.export_path = export_path
        super().__init__(meta_coverage, remove_megajournals, meta_coverage_processed_files)

    def counts(self, dictionary, label):
        # dictionary maps a label value (discipline or country) to journal IDs,
        # as produced by DisciplinesProcessor or CountriesProcessor
        meta_coverage = self.meta_df[["EP_id", "Publications_in_venue"]]
        count_df = pd.DataFrame(columns=[str(label), 'Journal_count', 'Publication_count'])
        for key, value in dictionary.items():
            venues_in_meta = meta_coverage[meta_coverage["EP_id"].isin(value)].reset_index(drop=True)
            # create df row
            new_row = pd.DataFrame([{str(label): key,
                                     'Journal_count': len(venues_in_meta),
                                     'Publication_count': venues_in_meta['Publications_in_venue'].sum()}])
            count_df = pd.concat([count_df, new_row], ignore_index=True)
        count_df = count_df.sort_values('Publication_count', ascending=False)
        save_to_results(count_df, self.export_path)
        return count_df
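
# Hypothetical usage sketch for CountsProcessor: the export path below is an
# assumption for illustration. The resulting CSV has one row per discipline
# with Journal_count and Publication_count columns.
def _demo_counts(meta_coverage, disc_dict):
    processor = CountsProcessor(meta_coverage, "counts/disciplines_counts.csv")
    return processor.counts(disc_dict, "Discipline")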
# ==================== US-EU COMPARISON ====================== #
class Compare_US_EU(ResultsProcessor):
    def __init__(self, meta_coverage, remove_megajournals=False,
                 meta_coverage_processed_files="results/SSH_Publications_in_OC_Meta_and_Open_Access_status.csv"):
        super().__init__(meta_coverage, remove_megajournals, meta_coverage_processed_files)

    def compare_us_eu(self, erih_ds, countries_dict):
        # load the ERIH PLUS dataset; countries_dict is the mapping produced
        # by CountriesProcessor.create_countries_dict()
        erih = load_data(erih_ds)
        countries_j = countries_dict
        # filter for European journals (country strings as they appear in the
        # data, including multi-country values such as "Italy, Norway")
        eu_ID = []
        eu_countries = ["Spain", "Romania", "Sweden", "United Kingdom", "Bulgaria", "Italy", "France", "Belgium", "Greece", "Poland", "Slovenia", "Albania", "Czechia", "Serbia", "Ukraine", "Netherlands", "Hungary", "Germany", "Italy, Norway", "Portugal", "Estonia", "Norway", "Bosnia and Herzegovina", "Slovakia", "Macedonia", "Lithuania", "Denmark", "Latvia", "Switzerland", "Finland", "Montenegro", "Ireland", "Croatia", "Austria", "Moldova", "Cyprus", "Germany, United Kingdom", "Czechia, Finland", "United Kingdom, Netherlands", "Luxembourg", "Iceland", "United Kingdom, Norway", "Malta", "Switzerland, United Kingdom", "Spain, United Kingdom", "Germany, Italy", "Belarus"]
        for country in countries_j:
            if country in eu_countries:
                eu_ID.extend(list(countries_j[country]))
        eu_info = erih[erih["Journal ID"].isin(eu_ID)]
        # filter for US journals
        us_ID = list(countries_j["United States"])
        us_info = erih[erih["Journal ID"].isin(us_ID)]  # 1120 rows vs 1161 of results
        # add a column with the number of disciplines per journal
        def count_disciplines_per_journal(df):
            df = df.copy()  # avoid mutating a slice of erih
            df["disc_count"] = df["ERIH PLUS Disciplines"].str.split(", ").str.len()
            return df
        us = count_disciplines_per_journal(us_info)
        eu = count_disciplines_per_journal(eu_info)
        # now filter SSH_Publications_in_OC_Meta_and_Open_Access_status.csv to
        # the EU and US ERIH info, keeping only the journals present in Meta
        meta_processed = self.meta_df
        us_meta = pd.merge(meta_processed, us, left_on="EP_id", right_on="Journal ID", how="inner")
        eu_meta = pd.merge(meta_processed, eu, left_on="EP_id", right_on="Journal ID", how="inner")
        # US_DATA and EU_DATA datasets
        columns = ["EP_id", "Publications_in_venue", "Original Title", "Country of Publication", "ERIH PLUS Disciplines", "disc_count"]
        renames = {"Original Title": "Original_Title",
                   "Country of Publication": "Country_of_Publication",
                   "ERIH PLUS Disciplines": "ERIH_PLUS_Disciplines"}
        us_data = us_meta[columns].rename(columns=renames)
        eu_data = eu_meta[columns].rename(columns=renames)
        save_to_results(us_data, "compareUS_EU/us_data.csv")
        save_to_results(eu_data, "compareUS_EU/eu_data.csv")
        # META_COVERAGE_EU and META_COVERAGE_US datasets
        meta_coverage_eu = eu_meta[["OC_omid", "issn", "EP_id", "Publications_in_venue", "Open_Access"]]
        meta_coverage_us = us_meta[["OC_omid", "issn", "EP_id", "Publications_in_venue", "Open_Access"]]
        save_to_results(meta_coverage_eu, "compareUS_EU/meta_coverage_eu.csv")
        save_to_results(meta_coverage_us, "compareUS_EU/meta_coverage_us.csv")
        return meta_coverage_us, meta_coverage_eu, us_data, eu_data
    # RESULT DATASET WITH PUBLICATIONS AND JOURNALS PER DISCIPLINE
    def counts_us_eu(self, disciplines_dict, label, meta_coverage_us_or_eu, export_path):
        meta_coverage = meta_coverage_us_or_eu
        count_df = pd.DataFrame(columns=[str(label), 'Journal_count', 'Publication_count'])
        for key, value in disciplines_dict.items():
            venues_in_meta = meta_coverage[meta_coverage["EP_id"].isin(value)].reset_index(drop=True)
            # create df row
            new_row = pd.DataFrame([{str(label): key,
                                     'Journal_count': len(venues_in_meta),
                                     'Publication_count': venues_in_meta['Publications_in_venue'].sum()}])
            count_df = pd.concat([count_df, new_row], ignore_index=True)
        count_df = count_df.sort_values('Publication_count', ascending=False)
        save_to_results(count_df, export_path)
        # note: a journal (and its publications) is counted once per discipline
        # it belongs to, so the column totals exceed the distinct totals; see
        # the consistency check sketched below
        return count_df
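
# A sketch of the consistency check referenced above (an assumed approach,
# not original code): rebuild the per-(discipline, journal) pairs as a long
# table and compare its totals with the aggregated count_df. Each journal
# counts once per discipline, so the column sums must equal the pair-level
# totals even though single journals appear under several disciplines.
def _check_counts_consistency(count_df, meta_coverage, disc_dict):
    pairs = pd.DataFrame(
        [(disc, jid) for disc, journals in disc_dict.items() for jid in journals],
        columns=["Discipline", "EP_id"],
    ).merge(meta_coverage[["EP_id", "Publications_in_venue"]], on="EP_id")
    assert count_df["Journal_count"].sum() == len(pairs)
    assert count_df["Publication_count"].sum() == pairs["Publications_in_venue"].sum()

# Hedged end-to-end sketch tying the pieces together; the ERIH PLUS path and
# the export path are assumptions for illustration only.
def _demo_us_eu_pipeline(meta_coverage, countries_dict, disc_dict):
    comparer = Compare_US_EU(meta_coverage)
    meta_us, meta_eu, us_data, eu_data = comparer.compare_us_eu(
        "data/erih_plus.csv", countries_dict)
    counts_us = comparer.counts_us_eu(
        disc_dict, "Discipline", meta_us, "compareUS_EU/us_counts.csv")
    _check_counts_consistency(counts_us, meta_us, disc_dict)
    return counts_us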