Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-46599: Stop using Butler regular expression searches #239

Merged
merged 1 commit into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions python/lsst/ap/verify/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@

import fnmatch
import os
import re
import shutil
from glob import glob
import logging
Expand Down Expand Up @@ -138,9 +137,12 @@ def _ensureRaws(self, processes):
RuntimeError
Raised if there are no files to ingest.
"""
# TODO: regex is workaround for DM-25945
rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
try:
collectionName = self.dataset.instrument.makeDefaultRawIngestRunName()
rawCollections = list(self.workspace.workButler.registry.queryCollections(collectionName))
except lsst.daf.butler.MissingCollectionError:
rawCollections = []

rawData = list(self.workspace.workButler.registry.queryDatasets(
'raw',
collections=rawCollections,
Expand Down
5 changes: 4 additions & 1 deletion python/lsst/ap/verify/pipeline_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,10 @@ def _getCollectionArguments(workspace, reuse):
# skip-existing-in would work around that, but would lead to a worse bug in
# the case that the user is alternating runs with and without --clean-run.
# registry.refresh()
oldRuns = list(registry.queryCollections(re.compile(workspace.outputName + r"/\d+T\d+Z")))
collectionPattern = re.compile(workspace.outputName + r"/\d+T\d+Z")
oldRuns = list(registry.queryCollections(workspace.outputName + "/*"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor comment but does it make things faster or slower if we use a more constrained glob such as /????????T*Z or /*T*Z?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is all happening in a small, temporary repo, so I don't think performance is an issue.

oldRuns = [run for run in oldRuns if collectionPattern.fullmatch(run)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be sure I understand this -- regexes are being phased out, but globs are not?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct -- globs will continue to work.


if reuse and oldRuns:
args.extend(["--extend-run", "--skip-existing"])
return args
Expand Down
29 changes: 24 additions & 5 deletions python/lsst/ap/verify/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import abc
import os
import pathlib
import re
import stat

import lsst.daf.butler as dafButler
Expand Down Expand Up @@ -219,10 +218,31 @@ def _ensureCollection(self, registry, name, collectionType):
The type of collection to add. This field is ignored when
testing if a collection exists.
"""
matchingCollections = list(registry.queryCollections(re.compile(name)))
if not matchingCollections:
if not self._doesCollectionExist(registry, name):
registry.registerCollection(name, type=collectionType)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that registerCollection is now(?) idempotent, I wonder if this code could be removed entirely...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this one can. The one below is less clear since it avoids modifying the chain if the collection already existed.

I'm gonna leave it alone to avoid needing to re-test, though.


def _doesCollectionExist(self, registry, name):
"""Check if a collection exists in the registry.

Parameters
----------
registry : `lsst.daf.butler.Registry`
The repository that may contain the collection.
name : `str`
The name of the collection to check for existence.

Returns
-------
exists : `bool`
`True` if the collection exists in the registry, `False` otherwise.

"""
try:
matchingCollections = list(registry.queryCollections(name))
return len(matchingCollections) > 0
except dafButler.MissingCollectionError:
return False

@property
def workButler(self):
"""A Butler that can read and write to a Gen 3 repository (`lsst.daf.butler.Butler`, read-only).
Expand All @@ -247,8 +267,7 @@ def workButler(self):

# Create an output chain here, so that workButler can see it.
# Definition does not conflict with what pipetask --output uses.
# Regex is workaround for DM-25945.
if not list(queryButler.registry.queryCollections(re.compile(self.outputName))):
if not self._doesCollectionExist(queryButler.registry, self.outputName):
queryButler.registry.registerCollection(self.outputName,
dafButler.CollectionType.CHAINED)
queryButler.registry.setCollectionChain(self.outputName, inputs)
Expand Down
5 changes: 1 addition & 4 deletions tests/test_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

import os
import pickle
import re
import shutil
import tempfile
import unittest
Expand Down Expand Up @@ -133,9 +132,7 @@ def testCalibIngestDriver(self):
# queryDatasets cannot (yet) search CALIBRATION collections, so we
# instead search the RUN-type collections that calibrations are
# ingested into first before being associated with a validity range.
calibrationRunPattern = re.compile(
re.escape(self.dataset.instrument.makeCollectionName("calib") + "/") + ".+"
)
calibrationRunPattern = self.dataset.instrument.makeCollectionName("calib") + "/*"
calibrationRuns = list(
self.butler.registry.queryCollections(
calibrationRunPattern,
Expand Down
Loading