From f7b47a85b055a38bee75b41a8d2a1e25c4ee638a Mon Sep 17 00:00:00 2001 From: Jan Reineke Date: Tue, 22 Aug 2023 08:51:32 +0200 Subject: [PATCH 1/3] First version of Openaire search; with mapping to schema.org types --- main.py | 40 ++++------- objects.py | 107 ++++++++++++++++-------------- sources/openaire.py | 157 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 226 insertions(+), 78 deletions(-) create mode 100644 sources/openaire.py diff --git a/main.py b/main.py index af01732..3e9b430 100644 --- a/main.py +++ b/main.py @@ -6,11 +6,11 @@ from objects import Article, Organization, Person, Dataset, Project from flask import Flask, render_template, request, make_response import threading -from sources import dblp, zenodo, openalex, resodate, oersi, wikidata, cordis, gesis, orcid, gepris, ieee #eulg +from sources import dblp, zenodo, openalex, resodate, oersi, wikidata, cordis, gesis, orcid, gepris, ieee, \ + openaire # eulg # import dblp, zenodo, openalex, resodate, wikidata, cordis, gesis, orcid, gepris # , eulg import details_page - logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf')) logger = logging.getLogger('nfdi_search_engine') app = Flask(__name__) @@ -29,6 +29,7 @@ def index(): return response + @app.route('/results', methods=['POST', 'GET']) def search_results(): # The search-session cookie setting can still be None if a user enters the @@ -49,12 +50,12 @@ def search_results(): 'events': [], 'fundings': [], 'others': [] - } + } threads = [] # add all the sources here in this list; for simplicity we should use the exact module name # ensure the main method which execute the search is named "search" in the module - sources = [resodate, oersi, openalex, orcid, dblp, zenodo, gesis, ieee, cordis, gepris] + sources = [resodate, oersi, openalex, orcid, dblp, zenodo, gesis, ieee, cordis, gepris, openaire] # sources = [dblp, zenodo, openalex, resodate, wikidata, cordis, gesis, orcid, gepris] for source in sources: @@ -65,15 +66,15 @@ def search_results(): for t in threads: t.join() # print(t.is_alive()) - + logger.info(f'Got {len(results["publications"])} publications') logger.info(f'Got {len(results["researchers"])} researchers') logger.info(f'Got {len(results["resources"])} resources') logger.info(f'Got {len(results["organizations"])} organizations') logger.info(f'Got {len(results["events"])} events') logger.info(f'Got {len(results["fundings"])} fundings') - logger.info(f'Got {len(results["others"])} others') - + logger.info(f'Got {len(results["others"])} others') + return render_template('results.html', results=results, search_term=search_term) @@ -90,6 +91,7 @@ def chatbox(): return response + @app.route('/publication-details') def publication_details(): response = make_response(render_template('publication-details.html')) @@ -118,7 +120,6 @@ def resources_details(): return response - @app.route('/details', methods=['POST', 'GET']) def details(): if request.method == 'GET': @@ -141,9 +142,7 @@ def details(): if __name__ == "__main__": app.run(host='0.0.0.0', port=5002, debug=True) - - -#region OLD CODE +# region OLD CODE # @app.route('/index-old') # def index_new(): @@ -234,24 +233,11 @@ def details(): # data[object_mappings[result_type]].append(result) # else: # logger.warning(f"Type {result_type} of result not yet handled") - - + + # # Remove items without results # data = dict((k, result) for k, result in data.items() if result) # return render_template('result.html', data=data, search_term=search_term) -#endregion - - - - - - - - - - - - - +# endregion diff --git a/objects.py b/objects.py index e7ec14f..74c4a2d 100644 --- a/objects.py +++ b/objects.py @@ -2,76 +2,80 @@ import dataclasses from dataclasses import dataclass, fields, field + @dataclass class thing: name: str = "" alternateName: str = "" description: str = "" url: str = "" - image: str = "" #url of the image + image: str = "" # url of the image identifier: str = "" source: str = "" def __post_init__(self): - # Loop through the fields + # Loop through the fields for field in fields(self): # If there is a default and the value of the field is none we can assign a value if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: setattr(self, field.name, field.default) + @dataclass class Organization(thing): address: str = "" email: str = "" legalName: str = "" location: str = "" - logo: str = "" # url + logo: str = "" # url numberOfEmployees: str = "" telephone: str = "" foundingDate: str = "" keywords: List[str] = field(default_factory=list) def __post_init__(self): - # Loop through the fields + # Loop through the fields for field in fields(self): # If there is a default and the value of the field is none we can assign a value if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: setattr(self, field.name, field.default) + @dataclass class Person(thing): additionalName: str = "" - address: str = "" #this should be a list - affiliation: Organization = None #this should be a list - alumniOf: Organization = None #this should be a list + address: str = "" # this should be a list + affiliation: Organization = None # this should be a list + alumniOf: Organization = None # this should be a list birthDate: str = "" birthPlace: str = "" deathDate: str = "" deathPlace: str = "" - email: str = "" #this should be a list + email: str = "" # this should be a list familyName: str = "" gender: str = "" - givenName: str = "" # usually the first name - homeLocation: str = "" #this should be a list - honorificPrefix: str = "" #An honorific prefix preceding a Person's name such as Dr/Mrs/Mr. #this should be a list - honorificSuffix: str = "" #An honorific suffix following a Person's name such as M.D./PhD/MSCSW. #this should be a list - jobTitle: str = "" #this should be a list - nationality: str = "" # we can later link it to country #this should be a list - workLocation: str = "" #this should be a list - worksFor: Organization = None #this should be a list - + givenName: str = "" # usually the first name + homeLocation: str = "" # this should be a list + honorificPrefix: str = "" # An honorific prefix preceding a Person's name such as Dr/Mrs/Mr. #this should be a list + honorificSuffix: str = "" # An honorific suffix following a Person's name such as M.D./PhD/MSCSW. #this should be a list + jobTitle: str = "" # this should be a list + nationality: str = "" # we can later link it to country #this should be a list + workLocation: str = "" # this should be a list + worksFor: Organization = None # this should be a list def __post_init__(self): - # Loop through the fields + # Loop through the fields for field in fields(self): # If there is a default and the value of the field is none we can assign a value if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: setattr(self, field.name, field.default) + Organization.founder = List[Person] # Organization.funder = Union[Organization(), Person()] Organization.parentOrganization = Organization() + @dataclass class Author(Person): orcid: str = "" @@ -79,46 +83,45 @@ class Author(Person): cited_by_count: str = "" def __post_init__(self): - # Loop through the fields + # Loop through the fields for field in fields(self): # If there is a default and the value of the field is none we can assign a value if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: setattr(self, field.name, field.default) - - @dataclass class CreativeWork(thing): abstract: str = "" alternativeHeadline: str = "" author: List[Union[Organization, Person]] = field(default_factory=list) - citation: str = "" # this should actually reference to articles + citation: str = "" # this should actually reference to articles countryOfOrigin: str = "" creativeWorkStatus: str = "" dateCreated: str = "" dateModified: str = "" datePublished: str = "" - encoding_contentUrl: str = "" + encoding_contentUrl: str = "" encodingFormat: str = "" - funder: Union[Organization, Person] = None # Organization | Person # we can use pipe operator for Union in Python >= 3.10 - funding: str = "" # we can change this to Grant + funder: Union[ + Organization, Person] = None # Organization | Person # we can use pipe operator for Union in Python >= 3.10 + funding: str = "" # we can change this to Grant genre: str = "" headline: str = "" inLanguage: List[str] = field(default_factory=list) keywords: List[str] = field(default_factory=list) - license: str = "" # url or license type - publication: str = "" #publication event + license: str = "" # url or license type + publication: str = "" # publication event publisher: Union[Organization, Person] = None sourceOrganization: Organization = None sponsor: Union[Organization, Person] = None text: str = "" - thumbnail: str = "" #ImageObject - thumbnailUrl: str = "" #url - version: str = "" + thumbnail: str = "" # ImageObject + thumbnailUrl: str = "" # url + version: str = "" def __post_init__(self): - # Loop through the fields + # Loop through the fields for field in fields(self): # If there is a default and the value of the field is none we can assign a value if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: @@ -126,7 +129,7 @@ def __post_init__(self): @dataclass -class Article(CreativeWork): +class Article(CreativeWork): articleBody: str = "" pageEnd: str = "" pageStart: str = "" @@ -134,7 +137,7 @@ class Article(CreativeWork): wordCount: str = "" def __post_init__(self): - # Loop through the fields + # Loop through the fields for field in fields(self): # If there is a default and the value of the field is none we can assign a value if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: @@ -142,37 +145,40 @@ def __post_init__(self): @dataclass -class Dataset(CreativeWork): - distribution: str = "" # can be DataDownload - issn: str = "" #it can be the unique ID of dataset +class Dataset(CreativeWork): + distribution: str = "" # can be DataDownload + issn: str = "" # it can be the unique ID of dataset def __post_init__(self): # Loop through the fields - for field in fields(self): - # If there is a default and the value of the field is none we can assign a value - if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: - setattr(self, field.name, field.default) + for field in fields(self): + # If there is a default and the value of the field is none we can assign a value + if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: + setattr(self, field.name, field.default) -#The 'Project' is a new addition to schema.org, and as of now, there are no defined properties for it +# The 'Project' is a new addition to schema.org, and as of now, there are no defined properties for it @dataclass -class Project(Organization): +class Project(Organization): dateStart: str = "" dateEnd: str = "" - dateLastModified : str = "" + dateLastModified: str = "" abstract: str = "" inLanguage: List[str] = field(default_factory=list) availableLanguages: List[str] = field(default_factory=list) objective: str = "" status: str = "" author: List[Union[Organization, Person]] = field(default_factory=list) + funder: List[Union[ + Organization, Person]] = field( + default_factory=list) # Organization | Person # we can use pipe operator for Union in Python >= 3.10 def __post_init__(self): # Loop through the fields - for field in fields(self): - # If there is a default and the value of the field is none we can assign a value - if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: - setattr(self, field.name, field.default) + for field in fields(self): + # If there is a default and the value of the field is none we can assign a value + if not isinstance(field.default, dataclasses._MISSING_TYPE) and getattr(self, field.name) is None: + setattr(self, field.name, field.default) @dataclass @@ -196,7 +202,6 @@ class Institute: description: str - @dataclass class Presentation: title: str @@ -258,7 +263,7 @@ class Lesson: description: str date: str - + @dataclass class Publisher: id: str @@ -285,7 +290,7 @@ class Funder: class Gesis: resource_type: str url: str - date: str + date: str title: str description: str authors: str @@ -313,4 +318,4 @@ class Gepris: title: str description: str date: str - applicant_or_leader:str + applicant_or_leader: str diff --git a/sources/openaire.py b/sources/openaire.py new file mode 100644 index 0000000..fa54378 --- /dev/null +++ b/sources/openaire.py @@ -0,0 +1,157 @@ +import requests +import utils +from objects import Dataset, Person, Author, Article, CreativeWork, Organization, Project +import logging + +logger = logging.getLogger('nfdi_search_engine') + + +def search(search_string: str, results): + """ Obtain the results from Openaire request and handles them accordingly. + + Args: + search_string: keyword(s) to search for + results: search answer formatted into different data types according to Openaire result_types + and mapped to schema.org types. + + Returns: + the results Object + """ + openaire_product_search(search_string, results) + openaire_project_search(search_string, results) + + logger.info(f"Got {len(results)} records from Openaire") + return results + + +def openaire_product_search(search_string, results): + api_url = 'https://api.openaire.eu/search/researchProducts' + response = requests.get(api_url, + params={"keywords": search_string, "format": "json", "size": 20}) + data = response.json() + logger.debug(f'Openaire product search response status code: {response.status_code}') + logger.debug(f'Openaire product search response headers: {response.headers}') + + # hits = data.get('response', {}).get('results', {}).get('result', []) + if response.status_code == 200: + try: + hits = data.get('response', {}).get('results', {}).get('result', []) + except AttributeError: + hits = [] # Set hits as an empty list if the 'get' operation fails due to AttributeError + + for hit in hits: + print(hit) + pro_result = hit.get('metadata', {}).get('oaf:entity', {}).get('oaf:result', {}) + result_type = pro_result.get('resulttype', {}).get('@classid', 'other') + # check result type to create an Object of the right Class + if result_type == 'publication': + product = Article() + elif result_type == 'dataset': + product = Dataset() + else: + product = CreativeWork() + product.source = 'Openaire' + product.genre = result_type + date = pro_result.get('dateofacceptance', None) + if date: + product.datePublished = date['$'] + + # title can be dict or list. If list, there are 'main title' and 'alternate title' + if type(pro_result.get('title')) is dict: + product.name = pro_result.get('title', {}).get('$', '') + elif type(pro_result.get('title')) is list: + for item in pro_result.get('title'): + if item['@classid'] == 'main title': + product.name = item['$'] + + # description can be dict or list + if type(pro_result.get('description')) is dict: + product.description = utils.remove_html_tags(pro_result.get('description', {}).get('$', '')) + elif type(pro_result.get('description')) is list: + product.description = utils.remove_html_tags(pro_result.get('description')[0].get('$', '')) + else: + product.description = '' + + # Language can be set or "und" = Undetermined + product.inLanguage = '' if pro_result.get('language', {}).get('@classid', '') == 'und' else pro_result.get( + 'language', {}).get('@classid', '') + + # pid can be dict or list + if type(pro_result.get('pid')) is dict: + product.identifier = pro_result.get('pid', {}).get('$', '') + elif type(pro_result.get('pid')) is list: + product.identifier = pro_result.get('pid', {})[0].get('$', '') + else: + product.identifier = '' + + # Creators can be dict, list, None + # creators = pro_result.get('creator', {}) if pro_result.get('creator') is not None else {} + creators = pro_result.get('creator', None) + if type(creators) is dict: + creator = Author() + creator.type = 'Person' + creator.name = creators.get('$', '') + product.author.append(creator) + elif type(creators) is list: + for item in creators: + creator = Author() + creator.type = 'Person' + creator.name = item.get('$', '') + product.author.append(creator) + + # Check genre to add result to right category + if product.genre == 'publication': + results['publications'].append(product) + elif product.genre == 'dataset' or product.genre == 'software': + results['resources'].append(product) + else: + results['others'].append(product) + + +def openaire_project_search(search_string, results): + api_url = 'https://api.openaire.eu/search/projects' + response = requests.get(api_url, params={"name": search_string, "format": "json", "size": 20}) + data = response.json() + logger.debug(f'Openaire project search response status code: {response.status_code}') + logger.debug(f'Openaire project search response headers: {response.headers}') + + if response.status_code == 200: + try: + hits = data.get('response', {}).get('results', {}).get('result', []) + except AttributeError: + hits = [] # Set hits as an empty list if the 'get' operation fails due to AttributeError + + for hit in hits: + pro_result = hit.get('metadata', {}).get('oaf:entity', {}).get('oaf:project', {}) + project = Project() + project.source = 'Openaire' + project.name = pro_result.get('title', {}).get('$', '') + project.dateStart = pro_result.get('startdate', {}).get('$', '') + project.dateEnd = pro_result.get('enddate', {}).get('$', '') + project.identifier = pro_result.get('callidentifier', {}).get('$', '') + + # fundingtree can be dict or list + # fundingtree = pro_result.get('fundingtree', {}) if pro_result.get('fundingtree') is not None else {} + fundingtree = pro_result.get('fundingtree', None) + if type(fundingtree) is dict: + orga = Organization() + orga.name = fundingtree.get('name', {}).get('$', '') + project.funder.append(orga) + elif type(fundingtree) is list: + for item in fundingtree: + orga = Organization() + orga.name = item.get('name', {}).get('$', '') + project.funder.append(orga) + + # "rels" can be None, dict, list + relations = pro_result.get('rels', {}).get('rel', {}) if pro_result.get('rels', {}) is not None else [] + if type(relations) is dict: + relations = [relations] + + # This need a review. Type 'Organization' ? + for rel in relations: + author_obj = Author() + author_obj.type = 'Organization' + author_obj.name = (rel.get('legalname', {}).get('$', '')) + project.author.append(author_obj) + results['others'].append(project) From ebcaea51cdbaec4de320ab340f9067cd47d96ad0 Mon Sep 17 00:00:00 2001 From: Jan Reineke Date: Mon, 28 Aug 2023 08:12:55 +0200 Subject: [PATCH 2/3] First version of Openaire search; with mapping to schema.org types --- main.py | 5 +++-- sources/openaire.py | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 3e9b430..57a1171 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ from flask import Flask, render_template, request, make_response import threading from sources import dblp, zenodo, openalex, resodate, oersi, wikidata, cordis, gesis, orcid, gepris, ieee, \ - openaire # eulg + codalab, eudat, openaire # eulg # import dblp, zenodo, openalex, resodate, wikidata, cordis, gesis, orcid, gepris # , eulg import details_page @@ -55,7 +55,8 @@ def search_results(): # add all the sources here in this list; for simplicity we should use the exact module name # ensure the main method which execute the search is named "search" in the module - sources = [resodate, oersi, openalex, orcid, dblp, zenodo, gesis, ieee, cordis, gepris, openaire] + sources = [resodate, oersi, openalex, orcid, dblp, zenodo, gesis, ieee, cordis, gepris, eudat, codalab, + wikidata, openaire] # sources = [dblp, zenodo, openalex, resodate, wikidata, cordis, gesis, orcid, gepris] for source in sources: diff --git a/sources/openaire.py b/sources/openaire.py index fa54378..c25f2dd 100644 --- a/sources/openaire.py +++ b/sources/openaire.py @@ -40,7 +40,6 @@ def openaire_product_search(search_string, results): hits = [] # Set hits as an empty list if the 'get' operation fails due to AttributeError for hit in hits: - print(hit) pro_result = hit.get('metadata', {}).get('oaf:entity', {}).get('oaf:result', {}) result_type = pro_result.get('resulttype', {}).get('@classid', 'other') # check result type to create an Object of the right Class @@ -73,8 +72,8 @@ def openaire_product_search(search_string, results): product.description = '' # Language can be set or "und" = Undetermined - product.inLanguage = '' if pro_result.get('language', {}).get('@classid', '') == 'und' else pro_result.get( - 'language', {}).get('@classid', '') + product.inLanguage = [] if pro_result.get('language', {}).get('@classid', '') == 'und' else [pro_result.get( + 'language', {}).get('@classid', '')] # pid can be dict or list if type(pro_result.get('pid')) is dict: From 574ab654ff9670ad64540b5d438ffb3fb9725fe5 Mon Sep 17 00:00:00 2001 From: Jan Reineke Date: Tue, 29 Aug 2023 08:44:44 +0200 Subject: [PATCH 3/3] First version of Openaire search; with mapping to schema.org types --- main.py | 16 ++++++++++------ sources/openaire.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index d8df22e..caef188 100644 --- a/main.py +++ b/main.py @@ -6,7 +6,8 @@ from objects import Article, Organization, Person, Dataset, Project from flask import Flask, render_template, request, make_response import threading -from sources import dblp, zenodo, openalex, resodate, oersi, wikidata, cordis, gesis, orcid, gepris, ieee, codalab, eudat # eulg +from sources import dblp, zenodo, openalex, resodate, oersi, wikidata, cordis, gesis, orcid, gepris, ieee, codalab, \ + eudat, openaire # eulg # import dblp, zenodo, openalex, resodate, wikidata, cordis, gesis, orcid, gepris # , eulg import details_page @@ -54,9 +55,10 @@ def search_results(): # add all the sources here in this list; for simplicity we should use the exact module name # ensure the main method which execute the search is named "search" in the module - sources = [resodate, oersi, openalex, orcid, dblp, zenodo, gesis, ieee, cordis, gepris, eudat, codalab, wikidata] + sources = [resodate, oersi, openalex, orcid, dblp, zenodo, gesis, ieee, cordis, gepris, eudat, codalab, + wikidata, openaire] # sources = [dblp, zenodo, openalex, resodate, wikidata, cordis, gesis, orcid, gepris] - + for source in sources: t = threading.Thread(target=source.search, args=(search_term, results,)) t.start() @@ -117,14 +119,13 @@ def resource_details(): response.set_cookie('search-session', request.cookies['session']) return response - @app.route('/researcher-details') def researcher_details(): response = make_response(render_template('researcher-details.html')) - - + + @app.route('/organization-details') def organization_details(): response = make_response(render_template('organization-details.html')) @@ -138,6 +139,7 @@ def organization_details(): return response + @app.route('/events-details') def events_details(): response = make_response(render_template('events-details.html')) @@ -151,6 +153,7 @@ def events_details(): return response + @app.route('/fundings-details') def fundings_details(): response = make_response(render_template('fundings-details.html')) @@ -163,6 +166,7 @@ def fundings_details(): return response + @app.route('/details', methods=['POST', 'GET']) def details(): if request.method == 'GET': diff --git a/sources/openaire.py b/sources/openaire.py index c25f2dd..e20ea89 100644 --- a/sources/openaire.py +++ b/sources/openaire.py @@ -1,6 +1,6 @@ import requests import utils -from objects import Dataset, Person, Author, Article, CreativeWork, Organization, Project +from objects import Dataset, Author, Article, CreativeWork, Organization, Project import logging logger = logging.getLogger('nfdi_search_engine')