Skip to content

Commit

Permalink
NN-617 Extract source code & sequencing data
Browse files Browse the repository at this point in the history
  • Loading branch information
mominaatifdar committed Dec 11, 2024
1 parent 50d0d3c commit a5a580a
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 4 deletions.
30 changes: 30 additions & 0 deletions scraping/scraping/data_formatting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import json
import os

def format_data(directory="."):
    """Merge repeated publication records into a single entry per DOI.

    Publications whose Data Availability and Code Availability statements
    were scraped as separate records (same ``DOI``) are collapsed into one
    record. The first record seen for a DOI is kept, and the ``paragraph``
    of every later record with that DOI is appended to it, separated by a
    newline. Records keep the order of their first occurrence.

    Every file in ``directory`` whose name starts with ``_`` and ends with
    ``udc.json`` is processed; the result is written next to it as
    ``<original name>_formatted.json``.

    Args:
        directory: Folder to scan for input files and to write the
            formatted files into. Defaults to the current working directory.

    Raises:
        KeyError: If a record has no ``DOI`` key, or a repeated record has
            no ``paragraph`` key.
    """
    files = [
        name for name in os.listdir(directory)
        if name.startswith('_') and name.endswith('udc.json')
    ]

    for file in files:
        with open(os.path.join(directory, file), 'r', encoding="utf-8") as f:
            data = json.load(f)

        # Keyed by DOI; dict membership is O(1) and insertion order is kept,
        # so no separate "seen" collection is needed.
        merged = {}
        for paper in data:
            doi = paper['DOI']
            if doi in merged:
                merged[doi]['paragraph'] += "\n" + paper['paragraph']
            else:
                merged[doi] = paper

        # The output name intentionally keeps the full input name as prefix
        # so existing consumers of "<name>_formatted.json" keep working.
        output_path = os.path.join(directory, f"{file}_formatted.json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(list(merged.values()), f, indent=4)

# Script entry point: run the merge only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    format_data()
8 changes: 4 additions & 4 deletions scraping/scraping/extract_data_mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def process_json_files(directory):

"""Generalized filepath in future"""

file_path = "_Lancet Infectious Diseases, The_Signal Transduction and Targeted Therapy_Annals of the Rheumatic Diseases_udc.json"
file_path = "_Science Bulletin_Cancer Discovery_Cell Research_udc.json"
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)

Expand All @@ -70,11 +70,11 @@ def process_json_files(directory):
"DOI": doi,
"Data Availability": data_availability
}
# print('Processing DOI:', doi)
print('Processing DOI:', doi)

try:
# start_time = time.time()
# result = send_to_mistral(input_data)
result = send_to_mistral(input_data)
# end_time = time.time()
# print("Execution time:", end_time-start_time)
results.append({
Expand All @@ -87,7 +87,7 @@ def process_json_files(directory):

# print(results)
# Save results to output file
output_file = "extraction_results_mistral.json"
output_file = "extraction_results_mistral_sciBulletin.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=4)

Expand Down

0 comments on commit a5a580a

Please sign in to comment.