Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance #975

Merged
merged 23 commits into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ceb8c9e
Make additional use of the statepoint cache.
joaander Feb 7, 2024
19cc47a
Make JobsCursor.__len__ and .__contains__ O(1).
joaander Feb 7, 2024
f7dc01f
Add validate_statepoint argument to Job.init()
joaander Feb 7, 2024
8e6e2fc
Rename statepoint_dict to statepoint_mapping.
joaander Feb 7, 2024
df2b495
Read the cache from disk in `open_job`.
joaander Feb 7, 2024
e20194f
Restore cache miss logger level to debug.
joaander Feb 7, 2024
712ed09
Instantiate Job by id directly when iterating over ids.
joaander Feb 7, 2024
15c2d60
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 7, 2024
99aaf28
Add statepoint_mapping test.
joaander Feb 8, 2024
57ef8a9
Pass information about the Job directory's existence from Project to …
joaander Feb 8, 2024
04fca6e
Populate _statepoint_mapping in additional code paths.
joaander Feb 8, 2024
4f135b6
Increase test coverage.
joaander Feb 8, 2024
c95ac0b
Update change log.
joaander Feb 8, 2024
f02beb5
Rename statepoint_mapping to cached_statepoint.
joaander Feb 9, 2024
32b8fa9
Doc fixes.
joaander Feb 9, 2024
e641333
Update code comments
cbkerr Feb 9, 2024
60e76a2
Use cached_statepoint in to_dataframe.
joaander Feb 9, 2024
7578993
Restore iteration order.
joaander Feb 12, 2024
4ebf74f
Validate cached_statepoint when read from disk.
joaander Feb 12, 2024
1de7155
Use cached_statepoint in groupby.
joaander Feb 12, 2024
d48d281
Remove validate argument from update_cache.
joaander Feb 12, 2024
9eb658f
Write state point as two words in doc strings
cbkerr Feb 13, 2024
6639566
Merge branch 'main' into improve-performance
cbkerr Feb 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion changelog.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,23 @@ The **signac** package follows `semantic versioning <https://semver.org/>`_.
Version 2
=========

[2.2.0] -- 2023-xx-xx
[2.2.0] -- 2024-xx-xx
---------------------

Added
+++++

- Official support for Python 3.12 (#957).
- ``Job.cached_statepoint`` - cached, read-only access to job state points. Faster than
``Job.statepoint`` (#975).

Changed
+++++++

- Restrict allowable tar file features in Python 3.12 (#957).
- linked views now can contain spaces and other characters except directory separators (#926).
- linked views now can be created on Windows, if 'Developer mode' is enabled (#430).
- Increase performance for many usage patterns (#975).

Fixed
+++++
Expand Down
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ The Job class

.. autosummary::

Job.cached_statepoint
Job.clear
Job.close
Job.data
Expand Down
110 changes: 89 additions & 21 deletions signac/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import shutil
from copy import deepcopy
from threading import RLock
from types import MappingProxyType
from typing import FrozenSet

from synced_collections.backends.collection_json import (
Expand Down Expand Up @@ -248,7 +249,8 @@

Jobs can be opened by ``statepoint`` or ``id_``. If both values are
provided, it is the user's responsibility to ensure that the values
correspond.
correspond. Set ``directory_known`` to ``True`` when the job directory
is known to exist - this skips some expensive isdir checks.

Parameters
----------
Expand All @@ -258,6 +260,8 @@
State point for the job. (Default value = None)
id_ : str, optional
The job identifier. (Default value = None)
directory_known : bool, optional
Set to true when the job directory is known to exist. (Default value = False)

"""

Expand All @@ -274,30 +278,34 @@
KEY_DATA = "signac_data"
"The job's datastore key."

def __init__(self, project, statepoint=None, id_=None):
def __init__(self, project, statepoint=None, id_=None, directory_known=False):
self._project = project
self._lock = RLock()
self._initialize_lazy_properties()
self._directory_known = directory_known

if statepoint is None and id_ is None:
raise ValueError("Either statepoint or id_ must be provided.")
elif statepoint is not None:
self._statepoint_requires_init = False
try:
self._id = calc_id(statepoint) if id_ is None else id_
except TypeError:
raise KeyTypeError
self._statepoint = _StatePointDict(
jobs=[self], filename=self._statepoint_filename, data=statepoint
)

# Update the project's state point cache immediately if opened by state point
self._project._register(self.id, statepoint)
self._cached_statepoint = statepoint
self._statepoint_requires_init = True
else:
# Only an id was provided. State point will be loaded lazily.
self._id = id_
self._statepoint_requires_init = True

# Fetch the cached statepoint from the project's cache. Don't load it
# from disk on a cache miss (will be loaded on demand).
try:
self._cached_statepoint = project._sp_cache[id_]
except KeyError:
self._cached_statepoint = None

def _initialize_lazy_properties(self):
"""Initialize all properties that are designed to be loaded lazily."""
with self._lock:
Expand Down Expand Up @@ -334,7 +342,7 @@

def __repr__(self):
return "{}(project={}, statepoint={})".format(
self.__class__.__name__, repr(self._project), self.statepoint
self.__class__.__name__, repr(self._project), self.cached_statepoint
)

@property
Expand Down Expand Up @@ -406,6 +414,33 @@
statepoint.update(update)
self.statepoint = statepoint

@property
def cached_statepoint(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Must we add a new public API in order to provide the performance benefits of this PR? I am unsure if we should permit users to call this, or if it should be only leveraged internally as a private property job._cached_statepoint.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As noted below, I want to conceal as much as possible about topics like caching and validation from the user API as we can.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I intentionally make this public.

Job.statepoint is writeable and carries significant overhead from synced_collections as shown in the benchmarks - reading one key from every job's statepoint takes 116 seconds even when the statepoint is in the cache.

Many workflows only need to read the statepoint, flow in particular. While flow could internally use a private _cached_statepoint for str keys, users need public access to the fast path so that their user-defined callable methods (key, select, sort_by) can complete quickly. Many users are frustrated with 10+ minute flow status updates. As shown in the benchmarks, the same loop over projects accessing cached_statepoint completes in 0.379 seconds - 306 times faster. This alone improves flow performance tremendously when using aggregates.

The alternative API I considered was to replace statepoint with the read-only statepoint and require update_statepoint to change it. I opted for a new attribute as changing statepoint semantics is a massive breaking change.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the explanation, that is helpful.

I would be open to changing statepoint semantics to be read-only in a future major version. We had discussed this at one point as a possibility for signac 2. Let's file an issue for that proposal.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noted in #983.

"""Get a copy of the job's state point as a read-only mapping.

:py:attr:`cached_statepoint` uses the state point cache to provide fast access to
the job's state point for reading.

.. note::

Create and update the state point cache by calling
:py:meth:`project.update_cache <signac.Project.update_cache>`
or running ``signac update-cache`` on the command line.

.. seealso::

Use :py:attr:`statepoint` to modify the job's state point.

Returns
-------
Mapping
Returns the job's state point.
"""
if self._cached_statepoint is None:
self._cached_statepoint = self._project._get_statepoint(self._id)

return MappingProxyType(self._cached_statepoint)

@property
def statepoint(self):
"""Get or set the job's state point.
Expand All @@ -416,6 +451,11 @@
`Modifying the State Point
<https://docs.signac.io/en/latest/jobs.html#modifying-the-state-point>`_.

.. tip::

Use :py:attr:`cached_statepoint` for fast read-only access to the
state point.

.. warning::

The state point object behaves like a dictionary in most cases,
Expand Down Expand Up @@ -443,14 +483,25 @@
"""
with self._lock:
if self._statepoint_requires_init:
# Load state point data lazily (on access).
self._statepoint = _StatePointDict(
jobs=[self], filename=self._statepoint_filename
)
statepoint = self._statepoint.load(self.id)
if self._cached_statepoint is None:
# Load state point data lazily (on access).
self._statepoint = _StatePointDict(
jobs=[self],
filename=self._statepoint_filename,
)
statepoint = self._statepoint.load(self.id)

# Update the project's state point cache when loaded lazily
self._project._register(self.id, statepoint)
self._cached_statepoint = statepoint
else:
# Create _StatePointDict lazily with a known statepoint dict.
self._statepoint = _StatePointDict(
jobs=[self],
filename=self._statepoint_filename,
data=self._cached_statepoint,
)

# Update the project's state point cache when loaded lazily
self._project._register(self.id, statepoint)
self._statepoint_requires_init = False

return self._statepoint
Expand Down Expand Up @@ -510,7 +561,7 @@
"""
with self._lock:
if self._document is None:
self.init()
self.init(validate_statepoint=False)
fn_doc = os.path.join(self.path, self.FN_DOCUMENT)
self._document = BufferedJSONAttrDict(
filename=fn_doc, write_concern=True
Expand Down Expand Up @@ -591,9 +642,9 @@
"""
with self._lock:
if self._stores is None:
self.init()
self.init(validate_statepoint=False)

Check warning on line 645 in signac/job.py

View check run for this annotation

Codecov / codecov/patch

signac/job.py#L645

Added line #L645 was not covered by tests
self._stores = H5StoreManager(self.path)
return self.init()._stores
return self._stores

Check warning on line 647 in signac/job.py

View check run for this annotation

Codecov / codecov/patch

signac/job.py#L647

Added line #L647 was not covered by tests

@property
def data(self):
Expand Down Expand Up @@ -640,7 +691,7 @@
"""
return self._project

def init(self, force=False):
def init(self, force=False, validate_statepoint=True):
"""Initialize the job's workspace directory.

This function will do nothing if the directory and the job state point
Expand All @@ -656,6 +707,10 @@
Overwrite any existing state point files, e.g., to repair them if
they got corrupted (Default value = False).

validate_statepoint : bool, optional
When True (the default), load the job state point and ensure that it matches
the id. When False, exit early when the job directory exists.

Returns
-------
Job
Expand All @@ -671,6 +726,15 @@
"""
with self._lock:
try:
# Fast early exit when not validating.
if not validate_statepoint:
if self._directory_known:
return self

if os.path.isdir(self.path):
self._directory_known = True
return self

# Attempt early exit if the state point file exists and is valid.
try:
statepoint = self.statepoint.load(self.id)
Expand All @@ -687,6 +751,8 @@
)
raise

self._directory_known = True

# The state point save will not overwrite an existing file on
# disk unless force is True, so the subsequent load will catch
# when a preexisting invalid file was present.
Expand Down Expand Up @@ -760,6 +826,8 @@
self._document = None
self._stores = None

self._directory_known = False

def move(self, project):
"""Move this job to project.

Expand Down Expand Up @@ -899,7 +967,7 @@

"""
self._cwd.append(os.getcwd())
self.init()
self.init(validate_statepoint=False)
logger.info(f"Enter workspace '{self.path}'.")
os.chdir(self.path)

Expand Down
Loading