Review dumpinvestigation.py example script:

- Move creation of search expressions to select the ICAT objects to be written into helper functions, - Get rid of legacy search expressions, use Query objects instead, - Use DISTINCT aggregator where appropriate to avoid including objects multiple times, - Various clean up.
icatproject · Oct 11, 2024 · b8120d4 · b8120d4
1 parent 902c427
commit b8120d4
Showing 1 changed file with 163 additions and 101 deletions.
diff --git a/doc/examples/dumpinvestigation.py b/doc/examples/dumpinvestigation.py
@@ -17,130 +17,192 @@
 
 logging.basicConfig(level=logging.INFO)
 
-formats = icat.dumpfile.Backends.keys()
-config = icat.config.Config()
-config.add_variable('file', ("-o", "--outputfile"),
-                    dict(help="output file name or '-' for stdout"),
-                    default='-')
-config.add_variable('format', ("-f", "--format"),
-                    dict(help="output file format", choices=formats),
-                    default='YAML')
-config.add_variable('investigation', ("investigation",),
-                    dict(help="name and optionally visit id "
-                         "(separated by a colon) of the investigation"))
-client, conf = config.getconfig()
-
-if client.apiversion < '4.4':
-    raise RuntimeError("Sorry, ICAT version %s is too old, need 4.4.0 or newer."
-                       % client.apiversion)
-client.login(conf.auth, conf.credentials)
-
-
 # ------------------------------------------------------------
 # helper
 # ------------------------------------------------------------
 
-def getinvestigation(invid):
+def get_investigation_id(client, invid):
     """Search the investigation id from name and optionally visitid."""
+    query = Query(client, "Investigation", attributes=["id"])
     l = invid.split(':')
-    if len(l) == 1:
-        # No colon, invid == name
-        searchexp = "Investigation.id [name='%s']" % tuple(l)
-    elif len(l) == 2:
+    query.addConditions({"name": "= '%s'" % l[0]})
+    if len(l) == 2:
         # one colon, invid == name:visitId
-        searchexp = "Investigation.id [name='%s' AND visitId='%s']" % tuple(l)
+        query.addConditions({"visitId": "= '%s'" % l[1]})
     else:
         # too many colons
         raise RuntimeError("Invalid investigation identifier '%s'" % invid)
-    return (client.assertedSearch(searchexp)[0])
+    return client.assertedSearch(query)[0]
 
-def mergesearch(sexps):
+def mergesearch(client, queries):
     """Do many searches and merge the results in one list excluding dups."""
     objs = set()
-    for se in sexps:
+    for se in queries:
         objs.update(client.search(se))
     return list(objs)
 
+# The following helper functions control what ICAT objects are written
+# in each of the dumpfile chunks.  There are three options for the
+# items in each list: either queries expressed as Query objects, or
+# queries expressed as string expressions, or lists of objects.  In
+# the first two cases, the search results will be written, in the last
+# case, the objects are written as provided.
+
+def get_auth_types(client, invid):
+    """Users and groups related to the investigation.
+    """
+    # We need the users related to our investigation via
+    # InvestigationUser, the users member of one of the groups related
+    # via InvestigationGroup, and the instrument scientists from the
+    # instruments related to the investigations.  These are
+    # independent searches, but the results are likely to overlap.  So
+    # we need to search and merge results first.
+    usersearch = [
+        Query(client, "User", conditions={
+            "investigationUsers."
+            "investigation.id": "= %d" % invid,
+        }),
+        Query(client, "User", conditions={
+            "userGroups.grouping.investigationGroups."
+            "investigation.id": "= %d" % invid,
+        }),
+        Query(client, "User", conditions={
+            "instrumentScientists.instrument.investigationInstruments."
+            "investigation.id": "= %d" % invid,
+        }),
+    ]
+    return [
+        mergesearch(client, usersearch),
+        Query(client, "Grouping", conditions={
+            "investigationGroups.investigation.id": "= %d" % invid,
+        }, includes=["userGroups.user"], aggregate="DISTINCT", order=True),
+    ]
+
+def get_static_types(client, invid):
+    """Static stuff that exists independently of the investigation in ICAT.
+    """
+    # Similar situation for ParameterType as for User: need to merge
+    # ParameterType used for InvestigationParameter, SampleParameter,
+    # DatasetParameter, and DatafileParameter.
+    ptsearch = [
+        Query(client, "ParameterType", conditions={
+            "investigationParameters."
+            "investigation.id": "= %d" % invid,
+        }, includes=["facility", "permissibleStringValues"]),
+        Query(client, "ParameterType", conditions={
+            "sampleParameters.sample."
+            "investigation.id": "= %d" % invid,
+        }, includes=["facility", "permissibleStringValues"]),
+        Query(client, "ParameterType", conditions={
+            "datasetParameters.dataset."
+            "investigation.id": "= %d" % invid,
+        }, includes=["facility", "permissibleStringValues"]),
+        Query(client, "ParameterType", conditions={
+            "datafileParameters.datafile.dataset."
+            "investigation.id": "= %d" % invid,
+        }, includes=["facility", "permissibleStringValues"]),
+    ]
+    return [
+        Query(client, "Facility",
+              conditions={
+                  "investigations.id": "= %d" % invid,
+              },
+              order=True),
+        Query(client, "Instrument",
+              conditions={
+                  "investigationInstruments.investigation.id": "= %d" % invid,
+              },
+              includes=["facility", "instrumentScientists.user"],
+              order=True),
+        mergesearch(client, ptsearch),
+        Query(client, "InvestigationType",
+              conditions={
+                  "investigations.id": "= %d" % invid,
+              },
+              includes=["facility"],
+              order=True),
+        Query(client, "SampleType",
+              conditions={
+                  "samples.investigation.id": "= %d" % invid,
+              },
+              includes=["facility"],
+              aggregate="DISTINCT",
+              order=True),
+        Query(client, "DatasetType",
+              conditions={
+                  "datasets.investigation.id": "= %d" % invid,
+              },
+              includes=["facility"],
+              aggregate="DISTINCT",
+              order=True),
+        Query(client, "DatafileFormat",
+              conditions={
+                  "datafiles.dataset.investigation.id": "= %d" % invid,
+              },
+              includes=["facility"],
+              aggregate="DISTINCT",
+              order=True),
+    ]
+
+def get_investigation_types(client, invid):
+    """The investigation and all the stuff that belongs to it.
+    """
+    # The set of objects to be included in the Investigation.
+    inv_includes = {
+        "facility", "type.facility", "investigationInstruments",
+        "investigationInstruments.instrument.facility", "shifts",
+        "keywords", "publications", "investigationUsers",
+        "investigationUsers.user", "investigationGroups",
+        "investigationGroups.grouping", "parameters",
+        "parameters.type.facility"
+    }
+    return [
+        Query(client, "Investigation",
+              conditions={"id":"in (%d)" % invid},
+              includes=inv_includes),
+        Query(client, "Sample",
+              conditions={"investigation.id":"= %d" % invid},
+              includes={"investigation", "type.facility",
+                        "parameters", "parameters.type.facility"},
+              order=True),
+        Query(client, "Dataset",
+              conditions={"investigation.id":"= %d" % invid},
+              includes={"investigation", "type.facility", "sample",
+                        "parameters", "parameters.type.facility"},
+              order=True),
+        Query(client, "Datafile",
+              conditions={"dataset.investigation.id":"= %d" % invid},
+              includes={"dataset", "datafileFormat.facility",
+                        "parameters", "parameters.type.facility"},
+              order=True)
+    ]
 
 # ------------------------------------------------------------
 # Do it
 # ------------------------------------------------------------
 
-invid = getinvestigation(conf.investigation)
-
+formats = icat.dumpfile.Backends.keys()
+config = icat.config.Config()
+config.add_variable('file', ("-o", "--outputfile"),
+                    dict(help="output file name or '-' for stdout"),
+                    default='-')
+config.add_variable('format', ("-f", "--format"),
+                    dict(help="output file format", choices=formats),
+                    default='YAML')
+config.add_variable('investigation', ("investigation",),
+                    dict(help="name and optionally visit id "
+                         "(separated by a colon) of the investigation"))
+client, conf = config.getconfig()
 
-# We need the users related to our investigation via
-# InvestigationUser, the users member of one of the groups related via
-# InvestigationGroup, and the instrument scientists from the
-# instruments related to the investigations.  These are independent
-# searches, but the results are likely to overlap.  So we need to
-# search and merge results first.  Similar situation for ParameterType.
-usersearch = [("User <-> InvestigationUser <-> Investigation [id=%d]"),
-              ("User <-> UserGroup <-> Grouping <-> InvestigationGroup "
-               "<-> Investigation [id=%d]"),
-              ("User <-> InstrumentScientist <-> Instrument "
-               "<-> InvestigationInstrument <-> Investigation [id=%d]")]
-ptsearch = [("ParameterType INCLUDE Facility, PermissibleStringValue "
-             "<-> InvestigationParameter <-> Investigation [id=%d]"),
-            ("ParameterType INCLUDE Facility, PermissibleStringValue "
-             "<-> SampleParameter <-> Sample <-> Investigation [id=%d]"),
-            ("ParameterType INCLUDE Facility, PermissibleStringValue "
-             "<-> DatasetParameter <-> Dataset <-> Investigation [id=%d]"),
-            ("ParameterType INCLUDE Facility, PermissibleStringValue "
-             "<-> DatafileParameter <-> Datafile <-> Dataset "
-             "<-> Investigation [id=%d]"), ]
+if client.apiversion < '4.4':
+    raise RuntimeError("Sorry, ICAT version %s is too old, need 4.4.0 or newer."
+                       % client.apiversion)
+client.login(conf.auth, conf.credentials)
 
-# The set of objects to be included in the Investigation.
-inv_includes = { "facility", "type.facility", "investigationInstruments",
-                 "investigationInstruments.instrument.facility", "shifts",
-                 "keywords", "publications", "investigationUsers",
-                 "investigationUsers.user", "investigationGroups",
-                 "investigationGroups.grouping", "parameters",
-                 "parameters.type.facility" }
 
-# The following lists control what ICAT objects are written in each of
-# the dumpfile chunks.  There are three options for the items in each
-# list: either queries expressed as Query objects, or queries
-# expressed as string expressions, or lists of objects.  In the first
-# two cases, the seacrh results will be written, in the last case, the
-# objects are written as provided.  We assume that there is only one
-# relevant facility, e.g. that all objects related to the
-# investigation are related to the same facility.  We may thus ommit
-# the facility from the ORDER BY clauses.
-authtypes =   [mergesearch([s % invid for s in usersearch]),
-               ("Grouping ORDER BY name INCLUDE UserGroup, User "
-                "<-> InvestigationGroup <-> Investigation [id=%d]" % invid)]
-statictypes = [("Facility ORDER BY name"),
-               ("Instrument ORDER BY name "
-                "INCLUDE Facility, InstrumentScientist, User "
-                "<-> InvestigationInstrument <-> Investigation [id=%d]"
-                % invid),
-               (mergesearch([s % invid for s in ptsearch])),
-               ("InvestigationType ORDER BY name INCLUDE Facility "
-                "<-> Investigation [id=%d]" % invid),
-               ("SampleType ORDER BY name, molecularFormula INCLUDE Facility "
-                "<-> Sample <-> Investigation [id=%d]" % invid),
-               ("DatasetType ORDER BY name INCLUDE Facility "
-                "<-> Dataset <-> Investigation [id=%d]" % invid),
-               ("DatafileFormat ORDER BY name, version INCLUDE Facility "
-                "<-> Datafile <-> Dataset <-> Investigation [id=%d]" % invid)]
-investtypes = [Query(client, "Investigation",
-                     conditions={"id":"in (%d)" % invid},
-                     includes=inv_includes),
-               Query(client, "Sample", order=["name"],
-                     conditions={"investigation.id":"= %d" % invid},
-                     includes={"investigation", "type.facility",
-                               "parameters", "parameters.type.facility"}),
-               Query(client, "Dataset", order=["name"],
-                     conditions={"investigation.id":"= %d" % invid},
-                     includes={"investigation", "type.facility", "sample",
-                               "parameters", "parameters.type.facility"}),
-               Query(client, "Datafile", order=["dataset.name", "name"],
-                     conditions={"dataset.investigation.id":"= %d" % invid},
-                     includes={"dataset", "datafileFormat.facility",
-                               "parameters", "parameters.type.facility"})]
+invid = get_investigation_id(client, conf.investigation)
 
 with open_dumpfile(client, conf.file, conf.format, 'w') as dumpfile:
-    dumpfile.writedata(authtypes)
-    dumpfile.writedata(statictypes)
-    dumpfile.writedata(investtypes)
+    dumpfile.writedata(get_auth_types(client, invid))
+    dumpfile.writedata(get_static_types(client, invid))
+    dumpfile.writedata(get_investigation_types(client, invid))