From 1e7fe23bac4c257c5465524ec9a74d32e54465ae Mon Sep 17 00:00:00 2001 From: Paul Frederiks Date: Fri, 29 Sep 2017 14:05:18 +0200 Subject: [PATCH] Adapt to new yoda-metadata schema --- moai/yoda.py | 129 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 41 deletions(-) diff --git a/moai/yoda.py b/moai/yoda.py index f538cca..ab70d0c 100644 --- a/moai/yoda.py +++ b/moai/yoda.py @@ -1,7 +1,7 @@ from lxml import etree from datetime import datetime, timedelta -from moai.utils import XPath +from moai.utils import XPath, get_moai_log class YodaContent(object): @@ -9,52 +9,99 @@ def __init__(self, provider): self.provider = provider self.id = None self.modified = None - self.deleted = None - self.sets = None - self.metadata = None + self.deleted = False + self.sets = dict() + self.metadata = dict() def update(self, path): - doc = etree.parse(path) + log = get_moai_log() + try: + doc = etree.parse(path) + except etree.ParseError: + log.warning("Failed to parse %s" % path) + return + xpath = XPath(doc, nsmap={}) self.root = doc.getroot() - - id = xpath.string('//Project_ID') + + id = xpath.string('//Persistent_Identifier_Datapackage') + if not id: + log.warning("Missing Persistent Identifier of Datapackage in %s" % path) + return + self.id = 'oai:%s' % id - self.modified = datetime.now() - timedelta(days=1) - self.deleted = True + + self.metadata['identifier'] = [id] + + last_modified = xpath.string('//Last_Modified_Date') + if not last_modified: + log.warning("Missing Last Modified Time in %s" % path) + self.modified = datetime.now() - timedelta(days=1) + else: + self.modified = datetime.strptime(last_modified, "%Y-%m-%d") author_data = [] + creators = xpath.strings('//Creator/Name') + if creators: + self.metadata['creator'] = creators + for creator in creators: + author_data.append({u"name": creator, u"role": [u"auth"]}) + + contributors = xpath.strings('//Contributor/Name') + if contributors: + 
self.metadata['contributor'] = contributors + for contributor in contributors: + author_data.append({u"name": contributor, u"role": [u"cont"]}) + + self.metadata["author_data"]= author_data + + title = xpath.string('//Title') + if title: + self.metadata['title'] = [title] + + description = xpath.string('//Description') + if description: + self.metadata['description'] = [description] + + language = xpath.string('//Language') + if language: + self.metadata['language'] = [language] + + datesinxml = [xpath.string('//Publication_Date'), + xpath.string('//Embargo_End_Date')] + + dates = [d for d in datesinxml if d] + if dates: + self.metadata['date'] = dates + + rightsinxml = [xpath.string('//License'), + xpath.string('//License/Properties/URL')] + + rights = [r for r in rightsinxml if r] + if rights: + self.metadata['rights'] = rights + + subjectinxml = xpath.strings('//Discipline') + xpath.strings('//Tag') + subject = [s for s in subjectinxml if s] + if subject: + self.metadata['subject'] = subject + + locations = xpath.strings('//Location_Covered') + perioddates = [xpath.string('//Start_Period'), xpath.string('//End_Period')] + period = "/".join([d for d in perioddates if d]) + if period: + coverage = locations + [period] + else: + coverage = locations + if coverage: + self.metadata['coverage'] = coverage + + relations = xpath.strings('//Persistent_Identifier') + if relations: + self.metadata['relation'] = relations - # Add creator of dataset. - author_data.append({'name': [xpath.string('//Creator')], - 'role': [u'aut']}) - - # Add all contributors to dataset. - for num, el in enumerate(xpath('//Contributor'), 1): - contributor = [xpath.string('//Contributor[%d]' % num)] - author_data.append({'name': contributor, - 'role': [u'aut']}) - - # Add metadata of dataset. 
- self.metadata = {'identifier': [id], - 'title': [xpath.string('//Project_Title')], - 'subject': [xpath.string('//Project_Description')], - 'description': [xpath.string('//Project_Description')], - 'creator': [d['name'][0] for d in author_data], - 'author_data': author_data, - 'language': [xpath.string('//Language_dataset')], - 'date': [xpath.string('//Embargo')]} - - # Clean dataset type. - type = xpath.string('//Dataset_Type') - type = type.replace(" ", "_") - - # Specify dataset. - self.sets = {type: - {u'name':xpath.string('//Dataset_Title'), - u'description':xpath.string('//Dataset_Description')}} - - published = xpath.string('//Publish_Metadata') - if published == 'Yes': - self.deleted = False + self.sets[u'yoda'] = { + u'name': u'YoDa', + u'description': u'share-collaborate environment for research data' + }