From 247b2e095defa5761606a31f0950895ae33ef245 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 14 Oct 2024 13:30:05 -0700 Subject: [PATCH] Stop using Butler regular expression searches Using regular expressions to search the Butler for collection names is deprecated in RFC-1040. The previous regular expression workaround was also potentially incorrect -- it would have matched any collection containing the given string, not just the exact string. --- python/lsst/ap/verify/ingestion.py | 10 ++++---- python/lsst/ap/verify/pipeline_driver.py | 5 +++- python/lsst/ap/verify/workspace.py | 29 ++++++++++++++++++++---- tests/test_ingestion.py | 5 +--- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/python/lsst/ap/verify/ingestion.py b/python/lsst/ap/verify/ingestion.py index 1b8713a2..d65e79bd 100644 --- a/python/lsst/ap/verify/ingestion.py +++ b/python/lsst/ap/verify/ingestion.py @@ -31,7 +31,6 @@ import fnmatch import os -import re import shutil from glob import glob import logging @@ -138,9 +137,12 @@ def _ensureRaws(self, processes): RuntimeError Raised if there are no files to ingest. """ - # TODO: regex is workaround for DM-25945 - rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName()) - rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter)) + try: + collectionName = self.dataset.instrument.makeDefaultRawIngestRunName() + rawCollections = list(self.workspace.workButler.registry.queryCollections(collectionName)) + except lsst.daf.butler.MissingCollectionError: + rawCollections = [] + rawData = list(self.workspace.workButler.registry.queryDatasets( 'raw', collections=rawCollections, diff --git a/python/lsst/ap/verify/pipeline_driver.py b/python/lsst/ap/verify/pipeline_driver.py index 37c6c0d5..651d2542 100644 --- a/python/lsst/ap/verify/pipeline_driver.py +++ b/python/lsst/ap/verify/pipeline_driver.py @@ -277,7 +277,10 @@ def _getCollectionArguments(workspace, reuse): # skip-existing-in would work around that, but would lead to a worse bug in # the case that the user is alternating runs with and without --clean-run. # registry.refresh() - oldRuns = list(registry.queryCollections(re.compile(workspace.outputName + r"/\d+T\d+Z"))) + collectionPattern = re.compile(workspace.outputName + r"/\d+T\d+Z") + oldRuns = list(registry.queryCollections(workspace.outputName + "/*")) + oldRuns = [run for run in oldRuns if collectionPattern.match(run)] + if reuse and oldRuns: args.extend(["--extend-run", "--skip-existing"]) return args diff --git a/python/lsst/ap/verify/workspace.py b/python/lsst/ap/verify/workspace.py index 6fc04cab..4721e6c8 100644 --- a/python/lsst/ap/verify/workspace.py +++ b/python/lsst/ap/verify/workspace.py @@ -26,7 +26,6 @@ import abc import os import pathlib -import re import stat import lsst.daf.butler as dafButler @@ -219,10 +218,31 @@ def _ensureCollection(self, registry, name, collectionType): The type of collection to add. This field is ignored when testing if a collection exists. """ - matchingCollections = list(registry.queryCollections(re.compile(name))) - if not matchingCollections: + if not self._doesCollectionExist(registry, name): registry.registerCollection(name, type=collectionType) + def _doesCollectionExist(self, registry, name): + """Check if a collection exists in the registry. + + Parameters + ---------- + registry : `lsst.daf.butler.Registry` + The repository that may contain the collection. + name : `str` + The name of the collection to check for existence. + + Returns + ------- + exists : `bool` + `True` if the collection exists in the registry, `False` otherwise. + + """ + try: + matchingCollections = list(registry.queryCollections(name)) + return len(matchingCollections) > 0 + except dafButler.MissingCollectionError: + return False + @property def workButler(self): """A Butler that can read and write to a Gen 3 repository (`lsst.daf.butler.Butler`, read-only). @@ -247,8 +267,7 @@ def workButler(self): # Create an output chain here, so that workButler can see it. # Definition does not conflict with what pipetask --output uses. - # Regex is workaround for DM-25945. - if not list(queryButler.registry.queryCollections(re.compile(self.outputName))): + if not self._doesCollectionExist(queryButler.registry, self.outputName): queryButler.registry.registerCollection(self.outputName, dafButler.CollectionType.CHAINED) queryButler.registry.setCollectionChain(self.outputName, inputs) diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 5ec12c5e..048a51a4 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -23,7 +23,6 @@ import os import pickle -import re import shutil import tempfile import unittest @@ -133,9 +132,7 @@ def testCalibIngestDriver(self): # queryDatasets cannot (yet) search CALIBRATION collections, so we # instead search the RUN-type collections that calibrations are # ingested into first before being associated with a validity range. - calibrationRunPattern = re.compile( - re.escape(self.dataset.instrument.makeCollectionName("calib") + "/") + ".+" - ) + calibrationRunPattern = self.dataset.instrument.makeCollectionName("calib") + "/*" calibrationRuns = list( self.butler.registry.queryCollections( calibrationRunPattern,