feat: add dataset metadata to the KG (#558)
* feat: export Datasets to graph

* chore: update restricted dot graph

* fix: update dataset metadata

* feat: adding DatasetFile from git

* allow for creation of an Entity object without a commit in the repo
* enable DatasetFile.from_revision
* preserve source path in Entity if it is added from a Submodule
rokroskar authored and jsam committed Jul 12, 2019
1 parent 7938ac4 commit fb443d7
Showing 20 changed files with 329 additions and 181 deletions.
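In short, `Dataset` and `DatasetFile` objects now carry a `client` back-reference and repo-root-relative paths, which is what lets dataset metadata be exported to the knowledge graph. A minimal sketch of the resulting API (names per this diff; illustrative, not a tested snippet):

    from renku.api import LocalClient

    # A sketch, not a test: assumes a renku repository in the current
    # directory and the client API as extended by this commit.
    client = LocalClient(path='.')

    for path, dataset in client.datasets.items():
        print(path, dataset.name)
        for file_ in dataset.files:
            print('  ', file_.path)  # paths are now relative to the repo root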
120 changes: 59 additions & 61 deletions renku/api/datasets.py
@@ -28,7 +28,6 @@
 
 import attr
 import requests
-import yaml
 
 from renku import errors
 from renku._compat import Path
@@ -65,20 +64,26 @@ def datasets_from_commit(self, commit=None):
                 blob = tree / self.METADATA
             except KeyError:
                 continue
 
-            yield Dataset.from_jsonld(
-                yaml.safe_load(blob.data_stream.read()),
-                __reference__=Path(blob.path),
+            dataset = Dataset.from_yaml(
+                self.path / Path(blob.path), client=self
             )
+            dataset.commit = commit
+            yield dataset
 
     @property
     def datasets(self):
         """Return mapping from path to dataset."""
         result = {}
         for path in self.renku_datasets_path.rglob(self.METADATA):
-            result[path] = Dataset.from_yaml(path)
+            result[path] = self.get_dataset(path)
         return result
 
+    def get_dataset(self, path):
+        """Return a dataset from a given path."""
+        if not path.is_absolute():
+            path = self.path / path
+        return Dataset.from_yaml(path, client=self)
+
     def dataset_path(self, name):
         """Get dataset path from name."""
         from renku.models.refs import LinkReference
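The new `get_dataset` helper resolves relative paths against the repository root before deserializing, so both spellings below should load the same metadata. Continuing the sketch above (the dataset id is hypothetical):

    from renku._compat import Path

    # Hypothetical metadata location under .renku/datasets/<id>/.
    rel = Path('.renku/datasets/1234abcd/metadata.yml')

    d1 = client.get_dataset(rel)                # resolved against client.path
    d2 = client.get_dataset(client.path / rel)  # absolute path passed through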
@@ -98,7 +103,7 @@ def load_dataset(self, name=None):
         if name:
             path = self.dataset_path(name)
             if path.exists():
-                dataset = Dataset.from_yaml(path)
+                dataset = self.get_dataset(path)
 
         return dataset
 
@@ -116,7 +121,9 @@ def with_dataset(self, name=None):
             path.parent.mkdir(parents=True, exist_ok=True)
 
         with with_reference(path):
-            dataset = Dataset(identifier=identifier, name=name)
+            dataset = Dataset(
+                identifier=identifier, name=name, client=self
+            )
 
         if name:
             LinkReference.create(client=self, name='datasets/' +
@@ -150,32 +157,38 @@ def add_data_to_dataset(
                     dataset, dataset_path, url, target, **kwargs
                 )
             else:
-                files = {}
+                files = []
                 for t in target:
-                    files.update(
+                    files.extend(
                         self._add_from_git(
                             dataset, dataset_path, url, t, **kwargs
                         )
                     )
         else:
             files = self._add_from_url(dataset, dataset_path, url, **kwargs)
 
-        ignored = self.find_ignored_paths(
-            *[
-                os.path.relpath(
-                    str(self.renku_datasets_path / dataset.uid / key),
-                    start=str(self.path),
-                ) for key in files.keys()
-            ]
-        )
+        ignored = self.find_ignored_paths(*(data['path']
+                                            for data in files)) or []
 
         if ignored:
             if force:
                 self.repo.git.add(*ignored, force=True)
             else:
                 raise errors.IgnoredFiles(ignored)
 
-        dataset.update_files(files.values())
+        # commit all new data
+        file_paths = {str(data['path']) for data in files if str(data['path'])}
+        self.repo.git.add(*(file_paths - set(ignored)))
+        self.repo.index.commit(
+            'renku dataset: committing {} newly added files'.
+            format(len(file_paths) + len(ignored))
+        )
+
+        # Generate the DatasetFiles
+        dataset_files = []
+        for data in files:
+            dataset_files.append(DatasetFile.from_revision(self, **data))
+        dataset.update_files(dataset_files)
 
     def _add_from_url(self, dataset, path, url, link=False, **kwargs):
         """Process an add from url and return the location on disk."""
@@ -202,15 +215,16 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
 
         # if we have a directory, recurse
         if src.is_dir():
-            files = {}
+            files = []
             dst.mkdir(parents=True, exist_ok=True)
             for f in src.iterdir():
-                files.update(
+                files.extend(
                     self._add_from_url(
                         dataset,
                         dst,
                         f.absolute().as_posix(),
                         link=link,
+                        **kwargs
                     )
                 )
             return files
@@ -243,17 +257,14 @@ def _add_from_url(self, dataset, path, url, link=False, **kwargs):
             dst.chmod(mode & ~(stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))
 
         self.track_paths_in_storage(str(dst.relative_to(self.path)))
-        dataset_path = self.renku_datasets_path / dataset.name
-        result = os.path.relpath(str(dst), start=str(dataset_path))
-        return {
-            result:
-            DatasetFile(
-                path=result,
-                url=url,
-                creator=dataset.creator,
-                dataset=dataset.name,
-            )
-        }
+
+        return [{
+            'path': dst.relative_to(self.path),
+            'url': url,
+            'creator': dataset.creator,
+            'dataset': dataset.name,
+            'parent': self
+        }]
 
     def _add_from_git(self, dataset, path, url, target, **kwargs):
         """Process adding resources from another git repository.
@@ -280,21 +291,13 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
             relative_url = None
 
         if relative_url:
-            result = str(
-                os.path.relpath(
-                    str(relative_url),
-                    start=str(self.renku_datasets_path / dataset.uid),
-                )
-            )
-            return {
-                result:
-                DatasetFile(
-                    path=result,
-                    url=url,
-                    creator=dataset.creator,
-                    dataset=dataset.name,
-                )
-            }
+            return [{
+                'path': url,
+                'url': url,
+                'creator': dataset.creator,
+                'dataset': dataset.name,
+                'parent': self
+            }]
 
         warnings.warn('Importing local git repository, use HTTPS')
         # determine where is the base repo path
@@ -355,12 +358,12 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
 
         # if we have a directory, recurse
         if src.is_dir():
-            files = {}
+            files = []
             dst.mkdir(parents=True, exist_ok=True)
             # FIXME get all files from submodule index
             for f in src.iterdir():
                 try:
-                    files.update(
+                    files.extend(
                         self._add_from_git(
                             dataset,
                             path,
@@ -386,23 +389,18 @@ def _add_from_git(self, dataset, path, url, target, **kwargs):
             if creator not in creators:
                 creators.append(creator)
 
-        dataset_path = self.renku_datasets_path / dataset.name
-        result = os.path.relpath(str(dst), start=str(dataset_path))
-
         if u.scheme in ('', 'file'):
             url = None
         else:
             url = '{}/{}'.format(url, target)
 
-        return {
-            result:
-            DatasetFile(
-                path=result,
-                url=url,
-                creator=creators,
-                dataset=dataset.name,  # TODO detect original dataset
-            )
-        }
+        return [{
+            'path': dst.relative_to(self.path),
+            'url': url,
+            'creator': creators,
+            'dataset': dataset.name,
+            'parent': self
+        }]
 
     def get_relative_url(self, url):
         """Determine if the repo url should be relative."""
8 changes: 3 additions & 5 deletions renku/cli/_checks/files_in_datasets.py
@@ -17,8 +17,8 @@
 # limitations under the License.
 """Check location of files in datasets."""
 
-import os
 from collections import defaultdict
+from pathlib import Path
 
 import click
 
@@ -31,13 +31,11 @@ def check_missing_files(client):
 
     for path, dataset in client.datasets.items():
         for file_ in dataset.files:
-            filepath = (path.parent / file_.path)
+            filepath = Path(file_.path)
             if not filepath.exists():
                 missing[str(
                     path.parent.relative_to(client.renku_datasets_path)
-                )].append(
-                    os.path.normpath(str(filepath.relative_to(client.path)))
-                )
+                )].append(str(filepath))
 
     if not missing:
         return True
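Because `DatasetFile.path` is now relative to the repository root, the missing-file check can test existence directly instead of resolving against the metadata directory. A condensed sketch of the new logic, reusing the `client` from the earlier sketch and assuming the process runs from the repository root (the real check keys results by metadata directory rather than dataset name):

    from collections import defaultdict
    from pathlib import Path

    missing = defaultdict(list)
    for path, dataset in client.datasets.items():
        for file_ in dataset.files:
            if not Path(file_.path).exists():  # repo-root-relative path
                missing[dataset.name].append(str(file_.path))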
25 changes: 20 additions & 5 deletions renku/cli/_format/graph.py
@@ -59,9 +59,11 @@ def dot(graph, simple=True, debug=False, landscape=False):
     )
 
     g.bind('prov', 'http://www.w3.org/ns/prov#')
+    g.bind('foaf', 'http://xmlns.com/foaf/0.1/')
     g.bind('wfdesc', 'http://purl.org/wf4ever/wfdesc#')
     g.bind('wf', 'http://www.w3.org/2005/01/wf/flow#')
     g.bind('wfprov', 'http://purl.org/wf4ever/wfprov#')
+    g.bind('schema', 'http://schema.org/')
 
     if debug:
         rdf2dot(g, sys.stdout)
@@ -230,20 +232,33 @@ def color(p):
         """Choose node color."""
         return 'BLACK'
 
-    for s, p, o in g:
+    # filter out nodes and edges created for directories
+    sparql = """
+        SELECT ?s ?p ?o
+        WHERE {
+            ?s ?p ?o
+            MINUS {
+                ?s rdf:type prov:Collection.
+            }
+            MINUS {
+                VALUES ?exclude { prov:wasInformedBy prov:influenced rdf:label }
+                ?s ?exclude ?o.
+            }
+        }
+    """
+
+    for s, p, o in g.query(sparql):
         sn = node(s)
         if p == rdflib.RDFS.label:
             continue
 
         # inject the type predicate into the node itself
         if p == rdflib.RDF.type:
             types[sn].add((qname(p, g), cgi.escape(o)))
             continue
-        if p == rdflib.term.URIRef('http://purl.org/dc/terms/isPartOf'):
+        # add the project membership to the node
+        if p == rdflib.term.URIRef('schema:isPartOf'):
            fields[sn].add((qname(p, g), cgi.escape(o)))
             continue
-        if p == rdflib.term.URIRef('http://www.w3.org/ns/prov#wasInformedBy'):
-            continue
 
         if isinstance(o, (rdflib.URIRef, rdflib.BNode)):
             on = node(o)
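The dot formatter now pre-filters the graph with SPARQL instead of skipping triples in Python: the `MINUS` clauses drop directory collections and noisy provenance edges before rendering. A standalone sketch of the same filtering pattern with rdflib (graph contents are illustrative; a fresh graph simply prints nothing):

    import rdflib

    g = rdflib.Graph()
    g.bind('prov', 'http://www.w3.org/ns/prov#')

    # Drop every triple whose subject is a prov:Collection, mirroring
    # the MINUS-based filtering added to the dot formatter.
    sparql = """
    SELECT ?s ?p ?o
    WHERE {
        ?s ?p ?o
        MINUS { ?s rdf:type prov:Collection . }
    }
    """

    for s, p, o in g.query(sparql):
        print(s, p, o)  # only subjects that are not prov:Collection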
4 changes: 2 additions & 2 deletions renku/cli/_providers/zenodo.py
@@ -228,11 +228,11 @@ def get_files(self):
 
         return [ZenodoFileSerializer(**file_) for file_ in self.files]
 
-    def as_dataset(self):
+    def as_dataset(self, client):
         """Deserialize `ZenodoRecordSerializer` to `Dataset`."""
         files = self.get_files()
         metadata = self.get_jsonld()
-        dataset = Dataset.from_jsonld(metadata)
+        dataset = Dataset.from_jsonld(metadata, client=client)
 
         serialized_files = []
         for file_ in files:
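`as_dataset` now takes the client so the resulting `Dataset` is bound to a repository. A fragment of the new call shape, with `provider` standing in for the CLI's provider lookup and a placeholder DOI:

    # Sketch only: `provider` and the DOI are placeholders; the new
    # `client` argument is the point here.
    record = provider.find_record('10.5281/zenodo.xxxxxxx')
    dataset_ = record.as_dataset(client)  # Dataset bound to the client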
6 changes: 3 additions & 3 deletions renku/cli/dataset.py
@@ -246,7 +246,7 @@ def edit(client, id):
     )
 
     edited = yaml.safe_load(metadata_edited)
-    updated_ = Dataset(**edited)
+    updated_ = Dataset(client=client, **edited)
 
     dataset_.update_metadata(updated_)
     dataset_.to_yaml()
@@ -303,7 +303,7 @@ def add_to_dataset(
         for file_ in with_metadata.files:
             for added_ in dataset.files:
 
-                if file_.filename.endswith(added_.path.name):
+                if added_.path.endswith(file_.filename):
                     if isinstance(file_.url, ParseResult):
                         file_.url = file_.url.geturl()
 
@@ -526,7 +526,7 @@ def import_(ctx, client, uri, name, extract):
     try:
 
         record = provider.find_record(uri)
-        dataset_ = record.as_dataset()
+        dataset_ = record.as_dataset(client)
         files_ = dataset_.files
 
         click.echo(
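The reversed `endswith` in `add_to_dataset` above follows from the path change elsewhere in this commit: `added_.path` is now a plain repo-root-relative string, so the old `added_.path.name` attribute access no longer fits, and the stored path is matched against the provider's bare filename as a suffix instead. Illustrative values:

    # Illustrative values only, not from a real import.
    added_path = 'data/mydataset/file.csv'  # DatasetFile.path after this commit
    filename = 'file.csv'                   # filename reported by the provider

    assert added_path.endswith(filename)    # the new suffix check matches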
4 changes: 2 additions & 2 deletions renku/cli/doctor.py
@@ -40,8 +40,8 @@ def doctor(ctx, client):
     from . import _checks
 
     is_ok = True
-    for attr in _checks.__all__:
-        is_ok &= getattr(_checks, attr)(client)
+    for check in _checks.__all__:
+        is_ok &= getattr(_checks, check)(client)
 
     if is_ok:
         click.secho('Everything seems to be ok.', fg='green')
3 changes: 1 addition & 2 deletions renku/cli/migrate.py
@@ -51,8 +51,7 @@ def datasets(ctx, client):
     from ._checks.location_datasets import _dataset_metadata_pre_0_3_4
 
     for old_path in _dataset_metadata_pre_0_3_4(client):
-        with old_path.open('r') as fp:
-            dataset = Dataset.from_jsonld(yaml.safe_load(fp))
+        dataset = Dataset.from_yaml(old_path, client=client)
 
         name = str(old_path.parent.relative_to(client.path / 'data'))
         new_path = (client.renku_datasets_path / dataset.uid / client.METADATA)
8 changes: 2 additions & 6 deletions renku/cli/move.py
@@ -81,14 +81,10 @@ def fmt_dst(path):
     renames = {}
 
     for file_ in dataset.files:
-        filepath = fmt_path(
-            os.path.normpath(str(path.parent / file_.path))
-        )
+        filepath = fmt_path(file_.path)
 
         if filepath in files:
-            renames[file_.path] = os.path.relpath(
-                destinations[filepath], start=str(path.parent)
-            )
+            renames[file_.path] = destinations[filepath]
 
     if renames:
         dataset = dataset.rename_files(
2 changes: 1 addition & 1 deletion renku/cli/remove.py
@@ -61,7 +61,7 @@ def fmt_path(path):
     remove = []
     for file_ in dataset.files:
         key = file_.path
-        filepath = fmt_path(file_.full_path)
+        filepath = fmt_path(file_.path)
         if filepath in files:
             remove.append(key)
 