Skip to content

Commit

Permalink
Stop using Butler regular expression searches
Browse files Browse the repository at this point in the history
Using regular expressions to search the Butler for collection names is
deprecated in RFC-1040.  The previous regular expression workaround was
also potentially incorrect -- it would have matched any collection
containing the given string, not just the exact string.
  • Loading branch information
dhirving committed Oct 15, 2024
1 parent 8818d04 commit 247b2e0
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 14 deletions.
10 changes: 6 additions & 4 deletions python/lsst/ap/verify/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@

import fnmatch
import os
import re
import shutil
from glob import glob
import logging
Expand Down Expand Up @@ -138,9 +137,12 @@ def _ensureRaws(self, processes):
RuntimeError
Raised if there are no files to ingest.
"""
# TODO: regex is workaround for DM-25945
rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
try:
collectionName = self.dataset.instrument.makeDefaultRawIngestRunName()
rawCollections = list(self.workspace.workButler.registry.queryCollections(collectionName))
except lsst.daf.butler.MissingCollectionError:
rawCollections = []

rawData = list(self.workspace.workButler.registry.queryDatasets(
'raw',
collections=rawCollections,
Expand Down
5 changes: 4 additions & 1 deletion python/lsst/ap/verify/pipeline_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,10 @@ def _getCollectionArguments(workspace, reuse):
# skip-existing-in would work around that, but would lead to a worse bug in
# the case that the user is alternating runs with and without --clean-run.
# registry.refresh()
oldRuns = list(registry.queryCollections(re.compile(workspace.outputName + r"/\d+T\d+Z")))
collectionPattern = re.compile(workspace.outputName + r"/\d+T\d+Z")
oldRuns = list(registry.queryCollections(workspace.outputName + "/*"))
oldRuns = [run for run in oldRuns if collectionPattern.match(run)]

if reuse and oldRuns:
args.extend(["--extend-run", "--skip-existing"])
return args
Expand Down
29 changes: 24 additions & 5 deletions python/lsst/ap/verify/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import abc
import os
import pathlib
import re
import stat

import lsst.daf.butler as dafButler
Expand Down Expand Up @@ -219,10 +218,31 @@ def _ensureCollection(self, registry, name, collectionType):
The type of collection to add. This field is ignored when
testing if a collection exists.
"""
matchingCollections = list(registry.queryCollections(re.compile(name)))
if not matchingCollections:
if not self._doesCollectionExist(registry, name):
registry.registerCollection(name, type=collectionType)

def _doesCollectionExist(self, registry, name):
"""Check if a collection exists in the registry.
Parameters
----------
registry : `lsst.daf.butler.Registry`
The repository that may contain the collection.
name : `str`
The name of the collection to check for existence.
Returns
-------
exists : `bool`
`True` if the collection exists in the registry, `False` otherwise.
"""
try:
matchingCollections = list(registry.queryCollections(name))
return len(matchingCollections) > 0
except dafButler.MissingCollectionError:
return False

@property
def workButler(self):
"""A Butler that can read and write to a Gen 3 repository (`lsst.daf.butler.Butler`, read-only).
Expand All @@ -247,8 +267,7 @@ def workButler(self):

# Create an output chain here, so that workButler can see it.
# Definition does not conflict with what pipetask --output uses.
# Regex is workaround for DM-25945.
if not list(queryButler.registry.queryCollections(re.compile(self.outputName))):
if not self._doesCollectionExist(queryButler.registry, self.outputName):
queryButler.registry.registerCollection(self.outputName,
dafButler.CollectionType.CHAINED)
queryButler.registry.setCollectionChain(self.outputName, inputs)
Expand Down
5 changes: 1 addition & 4 deletions tests/test_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

import os
import pickle
import re
import shutil
import tempfile
import unittest
Expand Down Expand Up @@ -133,9 +132,7 @@ def testCalibIngestDriver(self):
# queryDatasets cannot (yet) search CALIBRATION collections, so we
# instead search the RUN-type collections that calibrations are
# ingested into first before being associated with a validity range.
calibrationRunPattern = re.compile(
re.escape(self.dataset.instrument.makeCollectionName("calib") + "/") + ".+"
)
calibrationRunPattern = self.dataset.instrument.makeCollectionName("calib") + "/*"
calibrationRuns = list(
self.butler.registry.queryCollections(
calibrationRunPattern,
Expand Down

0 comments on commit 247b2e0

Please sign in to comment.