Merge pull request #156 from semantic-systems/feature/openaire-search

Feature/openaire search
semantic-systems · Aug 31, 2023 · f61c080 · f61c080
2 parents bec5930 + 574ab65
commit f61c080
Show file tree

Hide file tree

Showing 3 changed files with 170 additions and 7 deletions.
diff --git a/main.py b/main.py
@@ -6,7 +6,8 @@
 from objects import Article, Organization, Person, Dataset, Project
 from flask import Flask, render_template, request, make_response
 import threading
-from sources import dblp, zenodo, openalex, resodate, oersi, wikidata, cordis, gesis, orcid, gepris, ieee, codalab, eudat  # eulg
+from sources import dblp, zenodo, openalex, resodate, oersi, wikidata, cordis, gesis, orcid, gepris, ieee, codalab, \
+    eudat, openaire  # eulg
 # import dblp, zenodo, openalex, resodate, wikidata, cordis, gesis, orcid, gepris # , eulg
 import details_page
 
@@ -54,9 +55,10 @@ def search_results():
 
         # add all the sources here in this list; for simplicity we should use the exact module name
         # ensure the main method which executes the search is named "search" in the module
-        sources = [resodate, oersi, openalex, orcid, dblp, zenodo, gesis, ieee, cordis, gepris, eudat, codalab, wikidata]
+        sources = [resodate, oersi, openalex, orcid, dblp, zenodo, gesis, ieee, cordis, gepris, eudat, codalab,
+                   wikidata, openaire]
         # sources = [dblp, zenodo, openalex, resodate, wikidata, cordis, gesis, orcid, gepris]
-        
+
         for source in sources:
             t = threading.Thread(target=source.search, args=(search_term, results,))
             t.start()
@@ -117,14 +119,13 @@ def resource_details():
             response.set_cookie('search-session', request.cookies['session'])
 
     return response
-
 
 
 @app.route('/researcher-details')
 def researcher_details():
     response = make_response(render_template('researcher-details.html'))
-    
-    
+
+
 @app.route('/organization-details')
 def organization_details():
     response = make_response(render_template('organization-details.html'))
@@ -138,6 +139,7 @@ def organization_details():
 
     return response
 
+
 @app.route('/events-details')
 def events_details():
     response = make_response(render_template('events-details.html'))
@@ -151,6 +153,7 @@ def events_details():
 
     return response
 
+
 @app.route('/fundings-details')
 def fundings_details():
     response = make_response(render_template('fundings-details.html'))
@@ -163,6 +166,7 @@ def fundings_details():
 
     return response
 
+
 @app.route('/details', methods=['POST', 'GET'])
 def details():
     if request.method == 'GET':

diff --git a/objects.py b/objects.py
@@ -144,7 +144,7 @@ def __post_init__(self):
 @dataclass
 class Dataset(CreativeWork): 
     distribution: str = ""
-    issn: str = "" 
+    issn: str = ""
 
     def __post_init__(self):
         # Loop through the fields
@@ -166,6 +166,9 @@ class Project(Organization):
     objective: str = ""
     status: str = ""
     author: List[Union[Organization, Person]] = field(default_factory=list)
+    funder: List[Union[
+        Organization, Person]] = field(
+        default_factory=list)  # Organization | Person # we can use pipe operator for Union in Python >= 3.10
 
 
     def __post_init__(self):

diff --git a/sources/openaire.py b/sources/openaire.py
@@ -0,0 +1,156 @@
+import requests
+import utils
+from objects import Dataset, Author, Article, CreativeWork, Organization, Project
+import logging
+
+logger = logging.getLogger('nfdi_search_engine')
+
+
+def search(search_string: str, results):
+    """Run the Openaire product and project searches and collect their hits.
+
+    Args:
+        search_string: keyword(s) to search for
+        results: shared container the hits are appended to; the helper
+            searches index it by category ('publications', 'resources',
+            'others').
+
+    Returns:
+        the same results object that was passed in
+    """
+    # Run the two lookups independently so that a failure in one of them
+    # (network error, unexpected payload) does not prevent the other one
+    # from contributing its hits.
+    for lookup in (openaire_product_search, openaire_project_search):
+        try:
+            lookup(search_string, results)
+        except Exception:
+            logger.exception(f"Openaire lookup {lookup.__name__} failed")
+
+    # NOTE: len(results) is the size of the shared container, not the number
+    # of records contributed by Openaire alone.
+    logger.info(f"Got {len(results)} records from Openaire")
+    return results
+
+
+def _product_entries(node):
+    """Normalise a dict-or-list Openaire field to a list of dict entries."""
+    if isinstance(node, dict):
+        return [node]
+    if isinstance(node, list):
+        return [entry for entry in node if isinstance(entry, dict)]
+    return []
+
+
+def openaire_product_search(search_string, results):
+    """Query the Openaire research-products endpoint and append the hits to results.
+
+    Each hit becomes an Article ('publication'), a Dataset ('dataset') or a
+    generic CreativeWork (anything else) and is appended to
+    results['publications'], results['resources'] or results['others']
+    depending on its genre.
+
+    Args:
+        search_string: keyword(s) to search for
+        results: container with 'publications', 'resources' and 'others' lists
+
+    Returns:
+        None; results is modified in place.
+    """
+    api_url = 'https://api.openaire.eu/search/researchProducts'
+    # Without a timeout a stalled connection would block this search forever.
+    response = requests.get(api_url,
+                            params={"keywords": search_string, "format": "json", "size": 20},
+                            timeout=30)
+    logger.debug(f'Openaire product search response status code: {response.status_code}')
+    logger.debug(f'Openaire product search response headers: {response.headers}')
+
+    if response.status_code != 200:
+        return
+
+    # Decode only after the status check: error responses need not be JSON.
+    try:
+        data = response.json()
+    except ValueError:
+        logger.warning('Openaire product search returned a body that is not valid JSON')
+        return
+
+    try:
+        hits = data.get('response', {}).get('results', {}).get('result', [])
+    except AttributeError:
+        hits = []  # an intermediate level was null or not a dict
+
+    # 'result' may be null, a single dict or a list
+    for hit in _product_entries(hits):
+        metadata = hit.get('metadata') or {}
+        entity = metadata.get('oaf:entity') or {}
+        pro_result = entity.get('oaf:result') or {}
+
+        result_types = _product_entries(pro_result.get('resulttype'))
+        result_type = result_types[0].get('@classid', 'other') if result_types else 'other'
+
+        # check result type to create an Object of the right Class
+        product_class = {'publication': Article, 'dataset': Dataset}.get(result_type, CreativeWork)
+        product = product_class()
+        product.source = 'Openaire'
+        product.genre = result_type
+
+        dates = _product_entries(pro_result.get('dateofacceptance'))
+        if dates and '$' in dates[0]:
+            product.datePublished = dates[0]['$']
+
+        # title can be dict or list. If list, there are 'main title' and 'alternate title';
+        # prefer the main title and fall back to the first title if none is marked as main.
+        titles = _product_entries(pro_result.get('title'))
+        main_titles = [title.get('$', '') for title in titles if title.get('@classid') == 'main title']
+        if main_titles:
+            product.name = main_titles[-1]
+        elif titles:
+            product.name = titles[0].get('$', '')
+
+        # description can be dict or list; only the first entry is used
+        descriptions = _product_entries(pro_result.get('description'))
+        if descriptions:
+            product.description = utils.remove_html_tags(descriptions[0].get('$', ''))
+        else:
+            product.description = ''
+
+        # Language can be set or "und" = Undetermined; a missing code is treated like "und"
+        languages = _product_entries(pro_result.get('language'))
+        language_code = languages[0].get('@classid', '') if languages else ''
+        product.inLanguage = [language_code] if language_code and language_code != 'und' else []
+
+        # pid can be dict or list; only the first entry is used
+        pids = _product_entries(pro_result.get('pid'))
+        product.identifier = pids[0].get('$', '') if pids else ''
+
+        # Creators can be dict, list, None
+        for entry in _product_entries(pro_result.get('creator')):
+            creator = Author()
+            creator.type = 'Person'
+            creator.name = entry.get('$', '')
+            product.author.append(creator)
+
+        # Check genre to add result to right category
+        if product.genre == 'publication':
+            results['publications'].append(product)
+        elif product.genre in ('dataset', 'software'):
+            results['resources'].append(product)
+        else:
+            results['others'].append(product)
+
+
+def _project_entries(node):
+    """Normalise a dict-or-list Openaire field to a list of dict entries."""
+    if isinstance(node, dict):
+        return [node]
+    if isinstance(node, list):
+        return [entry for entry in node if isinstance(entry, dict)]
+    return []
+
+
+def _project_text(node):
+    """Return the '$' text of a field (or of its first entry), '' if absent."""
+    entries = _project_entries(node)
+    return entries[0].get('$', '') if entries else ''
+
+
+def openaire_project_search(search_string, results):
+    """Query the Openaire projects endpoint and append the hits to results['others'].
+
+    Every hit is mapped to a Project whose funders are taken from the
+    'fundingtree' field and whose authors are the organizations listed
+    under 'rels'.
+
+    Args:
+        search_string: project name (keywords) to search for
+        results: container with an 'others' list
+
+    Returns:
+        None; results is modified in place.
+    """
+    api_url = 'https://api.openaire.eu/search/projects'
+    # Without a timeout a stalled connection would block this search forever.
+    response = requests.get(api_url,
+                            params={"name": search_string, "format": "json", "size": 20},
+                            timeout=30)
+    logger.debug(f'Openaire project search response status code: {response.status_code}')
+    logger.debug(f'Openaire project search response headers: {response.headers}')
+
+    if response.status_code != 200:
+        return
+
+    # Decode only after the status check: error responses need not be JSON.
+    try:
+        data = response.json()
+    except ValueError:
+        logger.warning('Openaire project search returned a body that is not valid JSON')
+        return
+
+    try:
+        hits = data.get('response', {}).get('results', {}).get('result', [])
+    except AttributeError:
+        hits = []  # an intermediate level was null or not a dict
+
+    # 'result' may be null, a single dict or a list
+    for hit in _project_entries(hits):
+        metadata = hit.get('metadata') or {}
+        entity = metadata.get('oaf:entity') or {}
+        pro_result = entity.get('oaf:project') or {}
+
+        project = Project()
+        project.source = 'Openaire'
+        project.name = _project_text(pro_result.get('title'))
+        project.dateStart = _project_text(pro_result.get('startdate'))
+        project.dateEnd = _project_text(pro_result.get('enddate'))
+        project.identifier = _project_text(pro_result.get('callidentifier'))
+
+        # fundingtree can be dict, list or None
+        for tree in _project_entries(pro_result.get('fundingtree')):
+            orga = Organization()
+            funder_name = _project_text(tree.get('name'))
+            if not funder_name:
+                # NOTE(review): the funder appears to be nested under a
+                # 'funder' node in Openaire payloads - confirm against the API.
+                funder_nodes = _project_entries(tree.get('funder'))
+                if funder_nodes:
+                    funder_name = _project_text(funder_nodes[0].get('name'))
+            orga.name = funder_name
+            project.funder.append(orga)
+
+        # "rels" can be None, dict, list; a missing or null 'rel' yields no
+        # authors instead of a single author with an empty name.
+        rels = pro_result.get('rels')
+        relations = _project_entries(rels.get('rel')) if isinstance(rels, dict) else []
+
+        # Needs review: should related organizations really be modelled as
+        # Author objects of type 'Organization'?
+        for rel in relations:
+            author_obj = Author()
+            author_obj.type = 'Organization'
+            author_obj.name = _project_text(rel.get('legalname'))
+            project.author.append(author_obj)
+        results['others'].append(project)