Skip to content

Commit

Permalink
Added validation layer to SyncedCollection (#378)
Browse files Browse the repository at this point in the history
SyncedCollections can now arbitrary validators to input data to ensure that data conforms to a desired specification. This layer can be employed to ensure, for instance, that all inputs are valid JSON.
  • Loading branch information
vishav1771 authored Sep 9, 2020
1 parent 1906db8 commit ee015af
Show file tree
Hide file tree
Showing 11 changed files with 404 additions and 78 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ omit =

[tool:pytest]
filterwarnings =
ignore: .*[The indexing module | get_statepoint] is deprecated.*: DeprecationWarning
ignore: .*[The indexing module | get_statepoint | Use of.+as key] is deprecated.*: DeprecationWarning

[bumpversion:file:setup.py]

Expand Down
40 changes: 34 additions & 6 deletions signac/core/collection_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,39 @@
import json
import errno
import uuid
import warnings

from .synced_collection import SyncedCollection
from .syncedattrdict import SyncedAttrDict
from .synced_list import SyncedList
from .validators import json_format_validator


def _convert_key_to_str(data):
"""Recursively convert non-string keys to strings for (potentially nested) input collections.
This retains compatibility with auto-casting keys to strings, and will be
removed in signac 2.0.
Input collections must be of "base type" (dict or list).
"""
if isinstance(data, dict):
def _str_key(key):
if not isinstance(key, str):
warnings.warn(f"Use of {type(key).__name__} as key is deprecated "
"and will be removed in version 2.0",
DeprecationWarning)
key = str(key)
return key
return {_str_key(key): _convert_key_to_str(value) for key, value in data.items()}
elif isinstance(data, list):
return [_convert_key_to_str(value) for value in data]
return data


class JSONCollection(SyncedCollection):
"""Implement sync and load using a JSON back end."""

backend = __name__ # type: ignore
_backend = __name__ # type: ignore

def __init__(self, filename=None, write_concern=False, **kwargs):
self._filename = None if filename is None else os.path.realpath(filename)
Expand All @@ -29,7 +52,7 @@ def __init__(self, filename=None, write_concern=False, **kwargs):
super().__init__(**kwargs)

def _load(self):
"""Load the data from a JSON-file."""
"""Load the data from a JSON file."""
try:
with open(self._filename, 'rb') as file:
blob = file.read()
Expand All @@ -39,9 +62,11 @@ def _load(self):
return None

def _sync(self):
"""Write the data to json file."""
"""Write the data to JSON file."""
data = self.to_base()
# Serialize data:
# Converting non-string keys to string
data = _convert_key_to_str(data)
# Serialize data
blob = json.dumps(data).encode()
# When write_concern flag is set, we write the data into dummy file and then
# replace that file with original file.
Expand All @@ -57,10 +82,13 @@ def _sync(self):
file.write(blob)


JSONCollection.add_validator(json_format_validator)


class JSONDict(JSONCollection, SyncedAttrDict):
"""A dict-like mapping interface to a persistent JSON file.
The JSONDict inherits from :class:`~core.collection_api.SyncedCollection`
The JSONDict inherits from :class:`~core.synced_collection.SyncedCollection`
and :class:`~core.syncedattrdict.SyncedAttrDict`.
.. code-block:: python
Expand Down Expand Up @@ -106,7 +134,7 @@ class JSONDict(JSONCollection, SyncedAttrDict):
class JSONList(JSONCollection, SyncedList):
"""A non-string sequence interface to a persistent JSON file.
The JSONList inherits from :class:`~core.collection_api.SyncedCollection`
The JSONList inherits from :class:`~core.synced_collection.SyncedCollection`
and :class:`~core.syncedlist.SyncedList`.
.. code-block:: python
Expand Down
6 changes: 3 additions & 3 deletions signac/core/collection_mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
class MongoDBCollection(SyncedCollection):
"""Implement sync and load using a MongoDB backend."""

backend = __name__ # type: ignore
_backend = __name__ # type: ignore

def __init__(self, collection=None, **kwargs):
import bson # for InvalidDocument
import bson # for InvalidDocument error

self._collection = collection
self._errors = bson.errors
Expand Down Expand Up @@ -53,7 +53,7 @@ def _pseudo_deepcopy(self):
class MongoDBDict(MongoDBCollection, SyncedAttrDict):
"""A dict-like mapping interface to a persistent Mongo-database.
The MongoDBDict inherits from :class:`~core.collection_api.MongoCollection`
The MongoDBDict inherits from :class:`~core.synced_collection.MongoCollection`
and :class:`~core.syncedattrdict.SyncedAttrDict`.
.. code-block:: python
Expand Down
4 changes: 2 additions & 2 deletions signac/core/collection_redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
class RedisCollection(SyncedCollection):
"""Implement sync and load using a Redis backend."""

backend = __name__ # type: ignore
_backend = __name__ # type: ignore

def __init__(self, client=None, **kwargs):
self._client = client
Expand Down Expand Up @@ -90,7 +90,7 @@ class RedisDict(RedisCollection, SyncedAttrDict):
class RedisList(RedisCollection, SyncedList):
"""A non-string sequence interface to a persistent Redis file.
The RedisList inherits from :class:`~core.collection_api.SyncedCollection`
The RedisList inherits from :class:`~core.synced_collection.SyncedCollection`
and :class:`~core.syncedlist.SyncedList`.
.. code-block:: python
Expand Down
6 changes: 3 additions & 3 deletions signac/core/collection_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
class ZarrCollection(SyncedCollection):
"""Implement sync and load using a Zarr backend."""

backend = __name__ # type: ignore
_backend = __name__ # type: ignore

def __init__(self, group=None, **kwargs):
import numcodecs # zarr depends on numcodecs
Expand Down Expand Up @@ -47,7 +47,7 @@ def __deepcopy__(self, memo):
class ZarrDict(ZarrCollection, SyncedAttrDict):
"""A dict-like mapping interface to a persistent Zarr-database.
The ZarrDict inherits from :class:`~core.collection_api.ZarrCollection`
The ZarrDict inherits from :class:`~core.synced_collection.ZarrCollection`
and :class:`~core.syncedattrdict.SyncedAttrDict`.
.. code-block:: python
Expand Down Expand Up @@ -92,7 +92,7 @@ class ZarrDict(ZarrCollection, SyncedAttrDict):
class ZarrList(ZarrCollection, SyncedList):
"""A non-string sequence interface to a persistent Zarr file.
The ZarrList inherits from :class:`~core.collection_api.ZarrCollection`
The ZarrList inherits from :class:`~core.synced_collection.ZarrCollection`
and :class:`~core.syncedlist.SyncedList`.
.. code-block:: python
Expand Down
68 changes: 55 additions & 13 deletions signac/core/synced_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
methods the methods that must be implemented by any subclass to match the API.
"""

from typing import List, Callable
from contextlib import contextmanager
from abc import abstractmethod
from collections import defaultdict
Expand All @@ -29,7 +30,8 @@ class SyncedCollection(Collection):
in the underlying backend. The backend name wil be same as the module name.
"""

backend = None
_backend = None
_validators: List[Callable] = []

def __init__(self, name=None, parent=None):
self._data = None
Expand All @@ -41,47 +43,82 @@ def __init__(self, name=None, parent=None):
"Illegal argument combination, one of the two arguments, "
"parent or name must be None, but not both.")

@classmethod
def __init_subclass__(cls):
"""Add ``_validator`` attribute to every subclass.
Subclasses contain a list of validators that are applied to collection input data.
Every subclass must have a separate list so that distinct sets of validators can
be registered to each of them.
"""
cls._validators = []

@classmethod
def register(cls, *args):
"""Register the synced data structures.
r"""Register the synced data structures.
Registry is used when recursively converting synced data structures to determine
what to convert their children into.
The registry is used by :meth:`from_base` to determine the appropriate subclass
of :class:`SyncedCollection` that should be constructed from a particular object.
This functionality is necessary for converting nested data structures because
given, for instance, a dict of lists, it must be possible to map the nested lists to
the appropriate subclass of :class:`SyncedList` corresponding to the desired
backend.
Parameters
----------
*args
\*args
Classes to register
"""
if not hasattr(cls, 'registry'):
cls.registry = defaultdict(list)
for _cls in args:
cls.registry[_cls.backend].append(_cls)
for base_cls in args:
cls.registry[base_cls._backend].append(base_cls)

@property
def validators(self):
"""Return the list of validators applied to the instance."""
validators = []
# Classes inherit the validators of their parent classes.
for base_cls in type(self).__mro__:
if hasattr(base_cls, '_validators'):
validators.extend([v for v in base_cls._validators if v not in validators])
return validators

@classmethod
def add_validator(cls, *args):
r"""Register validator.
Parameters
----------
\*args
Validator(s) to register.
"""
cls._validators.extend([v for v in args if v not in cls._validators])

@classmethod
def from_base(cls, data, backend=None, **kwargs):
"""Dynamically resolve the type of object to the corresponding synced collection.
r"""Dynamically resolve the type of object to the corresponding synced collection.
Parameters
----------
data : any
Data to be converted from base class.
backend: str
Name of backend for synchronization. Default to backend of class.
**kwargs:
\*\*kwargs:
Kwargs passed to instance of synced collection.
Returns
-------
data : object
Synced object of corresponding base type.
"""
backend = cls.backend if backend is None else backend
backend = cls._backend if backend is None else backend
if backend is None:
raise ValueError("No backend found.")
for _cls in cls.registry[backend]:
if _cls.is_base_type(data):
return _cls(data=data, **kwargs)
for base_cls in cls.registry[backend]:
if base_cls.is_base_type(data):
return base_cls(data=data, **kwargs)
if NUMPY:
if isinstance(data, numpy.number):
return data.item()
Expand Down Expand Up @@ -133,6 +170,11 @@ def load(self):
else:
self._parent.load()

def _validate(self, data):
"""Validate the input data."""
for validator in self.validators:
validator(data)

# The following methods share a common implementation for
# all data structures and regardless of backend.
def __getitem__(self, key):
Expand Down
29 changes: 21 additions & 8 deletions signac/core/synced_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# This software is licensed under the BSD 3-Clause License.
"""Implements the SyncedList class.
This implements the list data-structure for SyncedCollection API by
implementing the convert methods (`to_base`, `from_base`) for lists.
This implements the list data structure for SyncedCollection API by
implementing the convert method `to_base` for lists.
"""

from collections.abc import Sequence
Expand All @@ -20,25 +20,27 @@
class SyncedList(SyncedCollection, MutableSequence):
"""Implementation of list data structure.
The SyncedList inherits from :class:`~core.collection_api.SyncedCollection`
The SyncedList inherits from :class:`~core.synced_collection.SyncedCollection`
and :class:`~collections.abc.MutableSequence`. Therefore, it behaves similar
to a :class:`list`.
.. warning::
While the SyncedList object behaves like a dictionary, there are
While the SyncedList object behaves like a :class:`list`, there are
important distinctions to remember. In particular, because operations
are reflected as changes to an underlying backend, copying (even deep
copying) a SyncedList instance may exhibit unexpected behavior. If a
true copy is required, you should use the `to_base()` method to get a
dictionary representation, and if necessary construct a new SyncedList.
:class:`list` representation, and if necessary construct a new
SyncedList.
"""

def __init__(self, data=None, **kwargs):
super().__init__(**kwargs)
if data is None:
self._data = []
else:
self._validate(data)
if NUMPY and isinstance(data, numpy.ndarray):
data = data.tolist()
with self._suspend_sync():
Expand Down Expand Up @@ -85,6 +87,7 @@ def _update(self, data=None):
"""Update the instance of SyncedList with data using depth-first traversal."""
if data is None:
data = []
self._validate(data)
if isinstance(data, Sequence) and not isinstance(data, str):
with self._suspend_sync():
# This loop avoids rebuilding existing synced collections for performance.
Expand Down Expand Up @@ -123,8 +126,9 @@ def reset(self, data=None):
"""
if data is None:
data = []
if NUMPY and isinstance(data, numpy.ndarray):
elif NUMPY and isinstance(data, numpy.ndarray):
data = data.tolist()
self._validate(data)
if isinstance(data, Sequence) and not isinstance(data, str):
with self._suspend_sync():
self._data = [self.from_base(data=value, parent=self) for value in data]
Expand All @@ -135,6 +139,7 @@ def reset(self, data=None):
.format(type(data)))

def __setitem__(self, key, value):
self._validate(value)
self.load()
with self._suspend_sync():
self._data[key] = self.from_base(data=value, parent=self)
Expand All @@ -145,28 +150,36 @@ def __reversed__(self):
return reversed(self._data)

def __iadd__(self, iterable):
# Convert input to a list so that iterators work as well as iterables.
iterable_data = list(iterable)
self._validate(iterable_data)
self.load()
with self._suspend_sync():
self._data += [self.from_base(data=value, parent=self) for value in iterable]
self._data += [self.from_base(data=value, parent=self) for value in iterable_data]
self.sync()
return self

def insert(self, index, item):
self._validate(item)
self.load()
with self._suspend_sync():
self._data.insert(index, self.from_base(data=item, parent=self))
self.sync()

def append(self, item):
self._validate(item)
self.load()
with self._suspend_sync():
self._data.append(self.from_base(data=item, parent=self))
self.sync()

def extend(self, iterable):
# Convert iterable to a list to ensure generators are exhausted only once
iterable_data = list(iterable)
self._validate(iterable_data)
self.load()
with self._suspend_sync():
self._data.extend([self.from_base(data=value, parent=self) for value in iterable])
self._data.extend([self.from_base(data=value, parent=self) for value in iterable_data])
self.sync()

def remove(self, item):
Expand Down
Loading

0 comments on commit ee015af

Please sign in to comment.