feat: add dataset metadata to the KG (#558)
* feat: export Datasets to graph

* chore: update restricted dot graph

* fix: update dataset metadata

* feat: adding DatasetFile from git

* allow for creation of an Entity object without a commit in the repo
* enable DatasetFile.from_revision
* preserve source path in Entity if it is added from a Submodule
rokroskar authored and jsam committed Jul 12, 2019
1 parent 7938ac4 commit fb443d7
Showing 20 changed files with 329 additions and 181 deletions.
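In short, `Dataset` and `DatasetFile` objects now carry a `client` back-reference and repo-root-relative paths, which is what lets dataset metadata be exported to the knowledge graph. A minimal sketch of the resulting API (names per this diff; illustrative, not a tested snippet):

    from renku.api import LocalClient

    # A sketch, not a test: assumes a renku repository in the current
    # directory and the client API as extended by this commit.
    client = LocalClient(path='.')

    for path, dataset in client.datasets.items():
        print(path, dataset.name)
        for file_ in dataset.files:
            print('  ', file_.path)  # paths are now relative to the repo root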
120 changes: 59 additions & 61 deletions renku/api/datasets.py
@@ -28,7 +28,6 @@
 
 import attr
 import requests
-import yaml
 
 from renku import errors
 from renku._compat import Path
@@ -65,20 +64,26 @@ def datasets_from_commit(self, commit=None):
                 blob = tree / self.METADATA
             except KeyError:
                 continue
 
-            yield Dataset.from_jsonld(
-                yaml.safe_load(blob.data_stream.read()),
-                __reference__=Path(blob.path),
+            dataset = Dataset.from_yaml(
+                self.path / Path(blob.path), client=self
             )
+            dataset.commit = commit
+            yield dataset
 
     @property
     def datasets(self):
         """Return mapping from path to dataset."""
         result = {}
         for path in self.renku_datasets_path.rglob(self.METADATA):
-            result[path] = Dataset.from_yaml(path)
+            result[path] = self.get_dataset(path)
         return result
 
+    def get_dataset(self, path):
+        """Return a dataset from a given path."""
+        if not path.is_absolute():
+            path = self.path / path
+        return Dataset.from_yaml(path, client=self)
+
     def dataset_path(self, name):
         """Get dataset path from name."""
         from renku.models.refs import LinkReference
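The new `get_dataset` helper resolves relative paths against the repository root before deserializing, so both spellings below should load the same metadata. Continuing the sketch above (the dataset id is hypothetical):

    from renku._compat import Path

    # Hypothetical metadata location under .renku/datasets/<id>/.
    rel = Path('.renku/datasets/1234abcd/metadata.yml')

    d1 = client.get_dataset(rel)                # resolved against client.path
    d2 = client.get_dataset(client.path / rel)  # absolute path passed through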
@@ -98,7 +103,7 @@ def load_dataset(self, name=None):
         if name:
             path = self.dataset_path(name)
             if path.exists():
-                dataset = Dataset.from_yaml(path)
+                dataset = self.get_dataset(path)
 
         return dataset
 
@@ -116,7 +121,9 @@ def with_dataset(self, name=None):
             path.parent.mkdir(parents=True, exist_ok=True)
 
         with with_reference(path):
-            dataset = Dataset(identifier=identifier, name=name)
+            dataset = Dataset(
+                identifier=identifier, name=name, client=self
+            )
 
         if name:
             LinkReference.create(client=self, name='datasets/' +
@@ -150,32 +157,38 @@ def add_data_to_dataset(
                     dataset, dataset_path, url, target, **kwargs
                 )
             else:
-                files = {}
+                files = []
                 for t in target:
-                    files.update(
+                    files.extend(
                         self._add_from_git(
                             dataset, dataset_path, url, t, **kwargs
                         )
                     )
         else:
             files = self._add_from_url(dataset, dataset_path, url, **kwargs)
 
-        ignored = self.find_ignored_paths(
-            *[
-                os.path.relpath(
-                    str(self.renku_datasets_path / dataset.uid / key),
-                    start=str(self.path),
-                ) for key in files.keys()
-            ]
-        )
+        ignored = self.find_ignored_paths(*(data['path']
+                                            for data in files)) or []
 
         if ignored:
             if force:
                 self.repo.git.add(*ignored, force=True)
             else:
                 raise errors.IgnoredFiles(ignored)
 
-        dataset.update_files(files.values())
+        # commit all new data
+        file_paths = {str(data['path']) for data in files if str(data['path'])}
+        self.repo.git.add(*(file_paths - set(ignored)))
+        self.repo.index.commit(
+            'renku dataset: committing {} newly added files'.
+            format(len(file_paths) + len(ignored))
+        )
+
+        # Generate the DatasetFiles
+        dataset_files = []
+        for data in files:
+            dataset_files.append(DatasetFile.from_revision(self, **data))
+        dataset.update_files(dataset_files)
 
     def _add_from_url(self, dataset, path, url, link=False, **kwargs):
         """Process an add from url and return the location on disk."""
@@ -202,15 +215,16 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
 
         # if we have a directory, recurse
         if src.is_dir():
-            files = {}
+            files = []
             dst.mkdir(parents=True, exist_ok=True)
             for f in src.iterdir():
-                files.update(
+                files.extend(
                     self._add_from_url(
                         dataset,
                         dst,
                         f.absolute().as_posix(),
                         link=link,
+                        **kwargs
                     )
                 )
             return files
@@ -243,17 +257,14 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
             dst.chmod(mode & ~(stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))
 
         self.track_paths_in_storage(str(dst.relative_to(self.path)))
-        dataset_path = self.renku_datasets_path / dataset.name
-        result = os.path.relpath(str(dst), start=str(dataset_path))
-        return {
-            result:
-            DatasetFile(
-                path=result,
-                url=url,
-                creator=dataset.creator,
-                dataset=dataset.name,
-            )
-        }
+
+        return [{
+            'path': dst.relative_to(self.path),
+            'url': url,
+            'creator': dataset.creator,
+            'dataset': dataset.name,
+            'parent': self
+        }]
 
     def _add_from_git(self, dataset, path, url, target, **kwargs):
         """Process adding resources from another git repository.
@@ -280,21 +291,13 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
             relative_url = None
 
         if relative_url:
-            result = str(
-                os.path.relpath(
-                    str(relative_url),
-                    start=str(self.renku_datasets_path / dataset.uid),
-                )
-            )
-            return {
-                result:
-                DatasetFile(
-                    path=result,
-                    url=url,
-                    creator=dataset.creator,
-                    dataset=dataset.name,
-                )
-            }
+            return [{
+                'path': url,
+                'url': url,
+                'creator': dataset.creator,
+                'dataset': dataset.name,
+                'parent': self
+            }]
 
         warnings.warn('Importing local git repository, use HTTPS')
         # determine where is the base repo path
@@ -355,12 +358,12 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
 
         # if we have a directory, recurse
         if src.is_dir():
-            files = {}
+            files = []
             dst.mkdir(parents=True, exist_ok=True)
             # FIXME get all files from submodule index
             for f in src.iterdir():
                 try:
-                    files.update(
+                    files.extend(
                         self._add_from_git(
                             dataset,
                             path,
@@ -386,23 +389,18 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
             if creator not in creators:
                 creators.append(creator)
 
-        dataset_path = self.renku_datasets_path / dataset.name
-        result = os.path.relpath(str(dst), start=str(dataset_path))
-
         if u.scheme in ('', 'file'):
             url = None
         else:
             url = '{}/{}'.format(url, target)
 
-        return {
-            result:
-            DatasetFile(
-                path=result,
-                url=url,
-                creator=creators,
-                dataset=dataset.name,  # TODO detect original dataset
-            )
-        }
+        return [{
+            'path': dst.relative_to(self.path),
+            'url': url,
+            'creator': creators,
+            'dataset': dataset.name,
+            'parent': self
+        }]
 
     def get_relative_url(self, url):
         """Determine if the repo url should be relative."""
8 changes: 3 additions & 5 deletions renku/cli/_checks/files_in_datasets.py
@@ -17,8 +17,8 @@
 # limitations under the License.
 """Check location of files in datasets."""
 
-import os
 from collections import defaultdict
+from pathlib import Path
 
 import click
 
@@ -31,13 +31,11 @@ def check_missing_files(client):
 
     for path, dataset in client.datasets.items():
         for file_ in dataset.files:
-            filepath = (path.parent / file_.path)
+            filepath = Path(file_.path)
             if not filepath.exists():
                 missing[str(
                     path.parent.relative_to(client.renku_datasets_path)
-                )].append(
-                    os.path.normpath(str(filepath.relative_to(client.path)))
-                )
+                )].append(str(filepath))
 
     if not missing:
         return True
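Because `DatasetFile.path` is now relative to the repository root, the missing-file check can test existence directly instead of resolving against the metadata directory. A condensed sketch of the new logic, reusing the `client` from the earlier sketch and assuming the process runs from the repository root (the real check keys results by metadata directory rather than dataset name):

    from collections import defaultdict
    from pathlib import Path

    missing = defaultdict(list)
    for path, dataset in client.datasets.items():
        for file_ in dataset.files:
            if not Path(file_.path).exists():  # repo-root-relative path
                missing[dataset.name].append(str(file_.path))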
25 changes: 20 additions & 5 deletions renku/cli/_format/graph.py
@@ -59,9 +59,11 @@ def dot(graph, simple=True, debug=False, landscape=False):
     )
 
     g.bind('prov', 'http://www.w3.org/ns/prov#')
+    g.bind('foaf', 'http://xmlns.com/foaf/0.1/')
     g.bind('wfdesc', 'http://purl.org/wf4ever/wfdesc#')
     g.bind('wf', 'http://www.w3.org/2005/01/wf/flow#')
     g.bind('wfprov', 'http://purl.org/wf4ever/wfprov#')
+    g.bind('schema', 'http://schema.org/')
 
     if debug:
         rdf2dot(g, sys.stdout)
@@ -230,20 +232,33 @@ def color(p):
         """Choose node color."""
         return 'BLACK'
 
-    for s, p, o in g:
+    # filter out nodes and edges created for directories
+    sparql = """
+        SELECT ?s ?p ?o
+        WHERE {
+            ?s ?p ?o
+            MINUS {
+                ?s rdf:type prov:Collection.
+            }
+            MINUS {
+                VALUES ?exclude { prov:wasInformedBy prov:influenced rdf:label }
+                ?s ?exclude ?o.
+            }
+        }
+    """
+
+    for s, p, o in g.query(sparql):
         sn = node(s)
         if p == rdflib.RDFS.label:
             continue
 
         # inject the type predicate into the node itself
         if p == rdflib.RDF.type:
             types[sn].add((qname(p, g), cgi.escape(o)))
             continue
-        if p == rdflib.term.URIRef('http://purl.org/dc/terms/isPartOf'):
+        # add the project membership to the node
+        if p == rdflib.term.URIRef('schema:isPartOf'):
            fields[sn].add((qname(p, g), cgi.escape(o)))
             continue
-        if p == rdflib.term.URIRef('http://www.w3.org/ns/prov#wasInformedBy'):
-            continue
 
         if isinstance(o, (rdflib.URIRef, rdflib.BNode)):
             on = node(o)
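The dot formatter now pre-filters the graph with SPARQL instead of skipping triples in Python: the `MINUS` clauses drop directory collections and noisy provenance edges before rendering. A standalone sketch of the same filtering pattern with rdflib (graph contents are illustrative; a fresh graph simply prints nothing):

    import rdflib

    g = rdflib.Graph()
    g.bind('prov', 'http://www.w3.org/ns/prov#')

    # Drop every triple whose subject is a prov:Collection, mirroring
    # the MINUS-based filtering added to the dot formatter.
    sparql = """
    SELECT ?s ?p ?o
    WHERE {
        ?s ?p ?o
        MINUS { ?s rdf:type prov:Collection . }
    }
    """

    for s, p, o in g.query(sparql):
        print(s, p, o)  # only subjects that are not prov:Collection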
4 changes: 2 additions & 2 deletions renku/cli/_providers/zenodo.py
@@ -228,11 +228,11 @@ def get_files(self):
 
         return [ZenodoFileSerializer(**file_) for file_ in self.files]
 
-    def as_dataset(self):
+    def as_dataset(self, client):
         """Deserialize `ZenodoRecordSerializer` to `Dataset`."""
         files = self.get_files()
         metadata = self.get_jsonld()
-        dataset = Dataset.from_jsonld(metadata)
+        dataset = Dataset.from_jsonld(metadata, client=client)
 
         serialized_files = []
         for file_ in files:
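`as_dataset` now takes the client so the resulting `Dataset` is bound to a repository. A fragment of the new call shape, with `provider` standing in for the CLI's provider lookup and a placeholder DOI:

    # Sketch only: `provider` and the DOI are placeholders; the new
    # `client` argument is the point here.
    record = provider.find_record('10.5281/zenodo.xxxxxxx')
    dataset_ = record.as_dataset(client)  # Dataset bound to the client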
6 changes: 3 additions & 3 deletions renku/cli/dataset.py
@@ -246,7 +246,7 @@ def edit(client, id):
     )
 
     edited = yaml.safe_load(metadata_edited)
-    updated_ = Dataset(**edited)
+    updated_ = Dataset(client=client, **edited)
 
     dataset_.update_metadata(updated_)
     dataset_.to_yaml()
@@ -303,7 +303,7 @@ def add_to_dataset(
         for file_ in with_metadata.files:
             for added_ in dataset.files:
 
-                if file_.filename.endswith(added_.path.name):
+                if added_.path.endswith(file_.filename):
                     if isinstance(file_.url, ParseResult):
                         file_.url = file_.url.geturl()
 
@@ -526,7 +526,7 @@ def import_(ctx, client, uri, name, extract):
     try:
 
         record = provider.find_record(uri)
-        dataset_ = record.as_dataset()
+        dataset_ = record.as_dataset(client)
         files_ = dataset_.files
 
         click.echo(
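The reversed `endswith` in `add_to_dataset` above follows from the path change elsewhere in this commit: `added_.path` is now a plain repo-root-relative string, so the old `added_.path.name` attribute access no longer fits, and the stored path is matched against the provider's bare filename as a suffix instead. Illustrative values:

    # Illustrative values only, not from a real import.
    added_path = 'data/mydataset/file.csv'  # DatasetFile.path after this commit
    filename = 'file.csv'                   # filename reported by the provider

    assert added_path.endswith(filename)    # the new suffix check matches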
4 changes: 2 additions & 2 deletions renku/cli/doctor.py
@@ -40,8 +40,8 @@ def doctor(ctx, client):
     from . import _checks
 
     is_ok = True
-    for attr in _checks.__all__:
-        is_ok &= getattr(_checks, attr)(client)
+    for check in _checks.__all__:
+        is_ok &= getattr(_checks, check)(client)
 
     if is_ok:
         click.secho('Everything seems to be ok.', fg='green')
3 changes: 1 addition & 2 deletions renku/cli/migrate.py
@@ -51,8 +51,7 @@ def datasets(ctx, client):
     from ._checks.location_datasets import _dataset_metadata_pre_0_3_4
 
     for old_path in _dataset_metadata_pre_0_3_4(client):
-        with old_path.open('r') as fp:
-            dataset = Dataset.from_jsonld(yaml.safe_load(fp))
+        dataset = Dataset.from_yaml(old_path, client=client)
 
         name = str(old_path.parent.relative_to(client.path / 'data'))
         new_path = (client.renku_datasets_path / dataset.uid / client.METADATA)
8 changes: 2 additions & 6 deletions renku/cli/move.py
@@ -81,14 +81,10 @@ def fmt_dst(path):
     renames = {}
 
     for file_ in dataset.files:
-        filepath = fmt_path(
-            os.path.normpath(str(path.parent / file_.path))
-        )
+        filepath = fmt_path(file_.path)
 
         if filepath in files:
-            renames[file_.path] = os.path.relpath(
-                destinations[filepath], start=str(path.parent)
-            )
+            renames[file_.path] = destinations[filepath]
 
     if renames:
         dataset = dataset.rename_files(
2 changes: 1 addition & 1 deletion renku/cli/remove.py
@@ -61,7 +61,7 @@ def fmt_path(path):
     remove = []
     for file_ in dataset.files:
         key = file_.path
-        filepath = fmt_path(file_.full_path)
+        filepath = fmt_path(file_.path)
         if filepath in files:
             remove.append(key)
 