Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lazy statepoint loading #239

Merged
merged 37 commits into from
Dec 30, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
8b14b15
Refactor uses of job._statepoint to job.statepoint.
bdice Oct 31, 2019
24d3b8c
Implement lazy loading of statepoints.
bdice Oct 31, 2019
6c57f11
Use cheaper condition for determining if the job id exists.
bdice Oct 31, 2019
f4a31e8
Fix flake8.
bdice Oct 31, 2019
737170a
Validate that the state point manifest hash matches the job id when l…
bdice Nov 1, 2019
82683b7
Merge branch 'master' into feature/lazy-statepoint-loading
bdice Nov 16, 2019
a40c22a
Added changelog line.
bdice Nov 18, 2019
51b6c26
Add explicit benchmark for iterating and loading state point.
csadorf Nov 18, 2019
9bcceaf
Merge branch 'master' into feature/lazy-statepoint-loading
bdice Nov 19, 2019
c51b413
Use _sp_cache to cache list of known valid job ids.
bdice Nov 19, 2019
9824ec7
Merge branch 'feature/lazy-statepoint-loading' of https://github.com/…
bdice Nov 19, 2019
67ba8da
Deregister jobs from the statepoint cache when they are removed from …
bdice Nov 19, 2019
be044f0
Fix issues with registration/deregistration.
bdice Nov 19, 2019
88c5369
Merge remote-tracking branch 'origin/master' into feature/lazy-statep…
bdice Dec 22, 2020
5c4e284
Update comment.
bdice Dec 28, 2020
bf9108e
Merge remote-tracking branch 'origin/master' into feature/lazy-statep…
bdice Dec 28, 2020
52643e0
Remove deregistration logic.
bdice Dec 28, 2020
6c5dcd2
Add comment explaining behavior of Project._sp_cache.
bdice Dec 28, 2020
0e7d6c3
Update docstrings and deprecated methods.
bdice Dec 28, 2020
10926f9
Update changelog.
bdice Dec 28, 2020
ce5ce6e
Revise implementation details and docstrings.
bdice Dec 28, 2020
0b65502
Update docstrings to use state point.
bdice Dec 28, 2020
206928c
Update comment.
bdice Dec 28, 2020
80acc1b
Update _register method.
bdice Dec 28, 2020
3f93f14
Use job.id instead of job._id and job_id instead of jobid (excluding …
bdice Dec 28, 2020
5304fff
Use descriptive variable names and comments.
bdice Dec 28, 2020
ebad980
Use descriptive variable names.
bdice Dec 29, 2020
eed20a4
Improve cache registration behavior and comments.
bdice Dec 29, 2020
76b5933
Remove extra blank lines.
bdice Dec 29, 2020
baf36e5
Merge branch 'master' into feature/lazy-statepoint-loading
bdice Dec 29, 2020
b029e6c
Improve cache filling behavior.
bdice Dec 29, 2020
9a12405
Revert project._sp_cache so it is not pre-populated with None values.…
bdice Dec 29, 2020
6494150
Add test of error when opening without a state point or job id.
bdice Dec 29, 2020
347b99e
Merge branch 'master' into feature/lazy-statepoint-loading
bdice Dec 29, 2020
7ca5d97
improve wording of docstring of FN_MANIFEST
cbkerr Dec 30, 2020
a8815d2
Fix documentation issues.
bdice Dec 30, 2020
ecdc7dc
Update docstrings and comments.
bdice Dec 30, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions signac/contrib/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,21 @@ class Job(object):

KEY_DATA = 'signac_data'

def __init__(self, project, statepoint, _id=None):
def __init__(self, project, statepoint=None, _id=None):
self._project = project

# Set statepoint and id
self._statepoint = SyncedAttrDict(statepoint, parent=_sp_save_hook(self))
self._id = calc_id(self._statepoint()) if _id is None else _id
if statepoint is None and _id is None:
raise ValueError('Either statepoint or _id must be provided.')

# Set statepoint if provided
if statepoint is not None:
self._statepoint = SyncedAttrDict(statepoint, parent=_sp_save_hook(self))
else:
# Statepoint will be loaded lazily
self._statepoint = None

# Set id
self._id = calc_id(self.statepoint()) if _id is None else _id

# Prepare job working directory
self._wd = os.path.join(project.workspace(), self._id)
Expand Down Expand Up @@ -89,7 +98,7 @@ def __str__(self):
def __repr__(self):
return "{}(project={}, statepoint={})".format(
self.__class__.__name__,
repr(self._project), self._statepoint)
repr(self._project), self.statepoint)

def __eq__(self, other):
return hash(self) == hash(other)
Expand Down Expand Up @@ -145,7 +154,7 @@ def reset_statepoint(self, new_statepoint):
else:
raise
# Update this instance
self._statepoint._data = dst._statepoint._data
self.statepoint._data = dst.statepoint._data
self._id = dst._id
self._wd = dst._wd
self._fn_doc = dst._fn_doc
Expand Down Expand Up @@ -203,6 +212,11 @@ def statepoint(self):
`sp_dict = job.statepoint()` instead of `sp = job.statepoint`.
For more information, see :class:`~signac.JSONDict`.
"""
if self._statepoint is None:
# Loads statepoint file lazily
statepoint = self._project.get_statepoint(self._id)
self._statepoint = SyncedAttrDict(statepoint, parent=_sp_save_hook(self))

return self._statepoint

@statepoint.setter
Expand Down Expand Up @@ -326,7 +340,7 @@ def _init(self, force=False):

try:
# Ensure to create the binary to write before file creation
blob = json.dumps(self._statepoint, indent=2)
blob = json.dumps(self.statepoint, indent=2)

try:
# Open the file for writing only if it does not exist yet.
Expand Down Expand Up @@ -568,7 +582,7 @@ def __exit__(self, err_type, err_value, tb):

def __setstate__(self, state):
self.__dict__.update(state)
self._statepoint._parent.jobs.append(self)
self.statepoint._parent.jobs.append(self)

def __deepcopy__(self, memo):
cls = self.__class__
Expand Down
21 changes: 18 additions & 3 deletions signac/contrib/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,12 @@ def open_job(self, statepoint=None, id=None):
id = matches[0]
elif len(matches) > 1:
raise LookupError(id)
return self.Job(project=self, statepoint=self.get_statepoint(id), _id=id)
bdice marked this conversation as resolved.
Show resolved Hide resolved
else:
# By elimination, len(matches) == 0
raise KeyError(id)
elif not self._contains_job_id(id):
raise KeyError(id)
return self.Job(project=self, statepoint=None, _id=id)

def _job_dirs(self):
try:
Expand Down Expand Up @@ -396,6 +401,16 @@ def num_jobs(self):

__len__ = num_jobs

def _contains_job_id(self, id):
"""Determine whether job id is in the project's data space.

:param id: The job id to test for initialization.
:type id: str
:returns: True when the job is initialized for this project.
:rtype: bool
"""
return os.path.exists(os.path.join(self._wd, id))

def __contains__(self, job):
"""Determine whether job is in the project's data space.

Expand All @@ -404,7 +419,7 @@ def __contains__(self, job):
:returns: True when the job is initialized for this project.
:rtype: bool
"""
return os.path.exists(os.path.join(self._wd, job.get_id()))
return self._contains_job_id(job.get_id())

@deprecated(deprecated_in="1.3", removed_in="2.0", current_version=__version__)
def build_job_search_index(self, index, _trust=False):
Expand Down Expand Up @@ -704,7 +719,7 @@ def write_statepoints(self, statepoints=None, fn=None, indent=2):

def _register(self, job):
"Register the job within the local index."
self._sp_cache[job._id] = job._statepoint._as_dict()
self._sp_cache[job._id] = job.statepoint()

def _get_statepoint_from_workspace(self, jobid):
"Attempt to read the statepoint from the workspace."
Expand Down
4 changes: 2 additions & 2 deletions tests/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ def test_corrupted_statepoint_file(self):
try:
logging.disable(logging.CRITICAL)
with self.assertRaises(JobsCorruptedError):
self.project.open_job(id=job.get_id())
self.project.open_job(id=job.get_id()).statepoint
finally:
logging.disable(logging.NOTSET)

Expand Down Expand Up @@ -467,7 +467,7 @@ def test_repair_corrupted_workspace(self):
# Iterating through the jobs should now result in an error.
with self.assertRaises(JobsCorruptedError):
for job in self.project:
pass
job.statepoint

with self.assertRaises(JobsCorruptedError):
self.project.repair()
Expand Down