From 5b2cc5316714d07223deb3f15469ad2988fb3e28 Mon Sep 17 00:00:00 2001 From: Markus Demleitner Date: Fri, 28 Jun 2024 13:50:50 +0200 Subject: [PATCH 1/3] Fixing the doctest for datalink in dal/index.rst. Using this opportunity to speed up the doctest by using smaller/faster services. It's now running in 10 seconds on my box in my neck of the net, which sounds a lot more reasonable than what was there before. --- docs/dal/index.rst | 70 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/docs/dal/index.rst b/docs/dal/index.rst index 6df144de..84ec4ce0 100644 --- a/docs/dal/index.rst +++ b/docs/dal/index.rst @@ -76,6 +76,8 @@ astrometric parameter, such as waveband ranges. Some services also accept `~astropy.time.Time` as ``time`` parameter. +.. doctest:: + >>> from astropy.time import Time >>> time = Time(('2015-01-01T00:00:00', '2018-01-01T00:00:00'), ... format='isot', scale='utc') @@ -171,6 +173,16 @@ mean G-band magnitude between 19 - 20: 2171810342771336704 323.25913736080776 51.94305655940998 19.0 2180349528028140800 310.5233961869657 50.3486391034819 19.0 +While DALResultsTable has some convenience functions, it is often +convenient to directly obtain a proper astropy Table using the +``to_table`` method: + +.. doctest-remote-data:: + + >>> result.to_table().columns[:3] + + + To explore more query examples, you can try either the ``description`` attribute for some services. For other services like this one, try the ``examples`` attribute. @@ -193,21 +205,15 @@ the ``examples`` attribute. POINT(l.raj2000, l.dej2000) )<0.0002 -- fine selection with PMs -Furthermore, one can find the names of the tables using: +TAPServices let you do extensive metadata inspection. For instance, +to see the tables available on the Simbad TAP service, say: .. 
doctest-remote-data:: - >>> print([tab_name for tab_name in tap_service.tables.keys()]) # doctest: +IGNORE_WARNINGS - ['ivoa.obs_radio', 'ivoa.obscore', 'tap_schema.columns', 'tap_schema.tables',..., 'taptest.main', 'veronqsos.data', 'vlastripe82.stripe82'] - + >>> simbad = vo.dal.TAPService("http://simbad.cds.unistra.fr/simbad/sim-tap") + >>> print([tab_name for tab_name in simbad.tables.keys()]) # doctest: +IGNORE_WARNINGS + ['TAP_SCHEMA.schemas', 'TAP_SCHEMA.tables', 'TAP_SCHEMA.columns', 'TAP_SCHEMA.keys', ... 'mesVelocities', 'mesXmm', 'otypedef', 'otypes', 'ref'] -And also the names of the columns from a known table, for instance -the first three columns: - -.. doctest-remote-data:: - - >>> result.table.columns[:3] # doctest: +IGNORE_WARNINGS - If you know a TAP service's access URL, you can directly pass it to :py:class:`~pyvo.dal.TAPService` to obtain a service object. @@ -347,7 +353,7 @@ you reach the ``maxrec`` limit: .. doctest-remote-data:: - >>> tap_results = tap_service.search("SELECT * FROM ivoa.obscore", maxrec=100000) # doctest: +SHOW_WARNINGS + >>> tap_results = tap_service.search("SELECT * FROM arihip.main", maxrec=5) # doctest: +SHOW_WARNINGS DALOverflowWarning: Partial result set. Potential causes MAXREC, async storage space, etc. Services will not let you raise maxrec beyond the hard match limit: @@ -522,8 +528,10 @@ region is always circular with ``pos`` as center: .. doctest-remote-data:: - >>> ssa_service = vo.dal.SSAService("https://irsa.ipac.caltech.edu/SSA") + >>> ssa_service = vo.dal.SSAService("http://archive.stsci.edu/ssap/search2.php?id=BEFS&") >>> ssa_results = ssa_service.search(pos=pos, diameter=size) + >>> ssa_results[0].getdataurl() + 'http://archive.stsci.edu/pub/vospectra/...' SSA queries can be further constrained by the ``band`` and ``time`` parameters. @@ -800,11 +808,39 @@ identifying the dataset you want it to return. >>> preview = next(row.getdatalink().bysemantics('#preview')).getdataset() -.. 
note:: - Since the creation of datalink objects requires a network roundtrip, it is - recommended to call ``getdatalink`` only once. +.. doctest-remote-data:: + >>> rows = vo.dal.TAPService("http://dc.g-vo.org/tap" + ... ).run_sync("select top 5 * from califadr3.cubes order by califaid") + >>> for dl in rows.iter_datalinks(): # doctest: +IGNORE_WARNINGS + ... print(next(dl.bysemantics("#preview"))["access_url"]) + http://dc.zah.uni-heidelberg.de/getproduct/califa/datadr3/V1200/IC5376.V1200.rscube.fits?preview=True + http://dc.zah.uni-heidelberg.de/getproduct/califa/datadr3/COMB/IC5376.COMB.rscube.fits?preview=True + http://dc.zah.uni-heidelberg.de/getproduct/califa/datadr3/V500/IC5376.V500.rscube.fits?preview=True + http://dc.zah.uni-heidelberg.de/getproduct/califa/datadr3/COMB/UGC00005.COMB.rscube.fits?preview=True + http://dc.zah.uni-heidelberg.de/getproduct/califa/datadr3/V1200/UGC00005.V1200.rscube.fits?preview=True + +The call to ``next`` in this example picks the first link marked +*preview*. For previews, this may be enough, but in general there can +be multiple links for a given semantics value for one dataset. + +It is sometimes useful to go back to the original row the datalink was +generated from; use the ``original_row`` attribute for that (which may +be None if pyvo does not know what row the datalink came from): + +.. doctest-remote-data:: + >>> dl.original_row["obs_title"] + 'CALIFA V1200 UGC00005' + +Rows from tables supporting datalink also have a ``getdatalink()`` +method that returns a ``DatalinkResults`` instance. In general, this is +less flexible than using ``iter_datalinks``, and it may also cause more +network traffic because each such call will cause a network request. -Of course one can also build a datalink object from its url. 
+When one has a link to a Datalink document – for instance, from an +obscore or SIAP service, where the media type is +application/x-votable;content=datalink –, one can build a +DatalinkResults using +:py:meth:`~pyvo.dal.adhoc.DatalinkResults.from_result_url`: .. doctest-remote-data:: From f7b0f79cab9948c8c0f2538b9e566ffc0c6a7322 Mon Sep 17 00:00:00 2001 From: Markus Demleitner Date: Thu, 20 Jun 2024 08:58:00 +0200 Subject: [PATCH 2/3] Adding an .original_row attribute to DatalinkResults. It also does some minor improvements to the documentation of how to use datalink. But that part could really use a lot more love... --- CHANGES.rst | 3 ++ docs/dal/index.rst | 34 ++++++++++++++--------- pyvo/dal/adhoc.py | 49 ++++++++++++++++++++++++++------- pyvo/dal/tests/test_datalink.py | 8 +++++- 4 files changed, 70 insertions(+), 24 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 1f8c57ce..aefe677d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -37,6 +37,9 @@ Enhancements and Fixes - RegTAP constraints involving tables other than rr.resource are now done via subqueries for less duplication of interfaces. [#562] +- Where datalink records are made from table rows, the table row is + now accessible as datalinks.original_row. [] + Deprecations and Removals ------------------------- diff --git a/docs/dal/index.rst b/docs/dal/index.rst index 84ec4ce0..3b7b9882 100644 --- a/docs/dal/index.rst +++ b/docs/dal/index.rst @@ -796,17 +796,22 @@ as quantities): >>> astropy_table = resultset.to_table() >>> astropy_qtable = resultset.to_qtable() -Multiple datasets ------------------ -PyVO supports multiple datasets exposed on record level through the datalink. -To get an iterator yielding specific datasets, call -:py:meth:`pyvo.dal.adhoc.DatalinkResults.bysemantics` with the identifier -identifying the dataset you want it to return. +Datalink +-------- -.. remove skip once https://github.com/astropy/pyvo/issues/361 is fixed -.. 
doctest-skip:: +Datalink lets operators associate multiple artefacts with a dataset. +Examples include linking raw data, applicable or applied calibration +data, derived datasets such as extracted sources, extra documentation, +and much more. - >>> preview = next(row.getdatalink().bysemantics('#preview')).getdataset() +Datalink can be used both on result rows of queries and from +datalink-valued URLs. The typical use is to call ``iter_datalinks()`` +on some DAL result; this will iterate over all datalinks pyVO finds in a +document and yield :py:class:`pyvo.dal.adhoc.DatalinkResults` instances +for them. In those, you can, for instance, pick out items by semantics, +where the standard vocabulary datalink documents use is documented at +http://www.ivoa.net/rdf/datalink/core. Here is how to find URLs for +previews: .. doctest-remote-data:: >>> rows = vo.dal.TAPService("http://dc.g-vo.org/tap" @@ -848,6 +853,9 @@ DatalinkResults using >>> # In this example you know the URL from somewhere >>> url = 'https://ws.cadc-ccda.hia-iha.nrc-cnrc.gc.ca/caom2ops/datalink?ID=ivo%3A%2F%2Fcadc.nrc.ca%2FHSTHLA%3Fhst_12477_28_acs_wfc_f606w_01%2Fhst_12477_28_acs_wfc_f606w_01_drz' >>> datalink = DatalinkResults.from_result_url(url) + >>> next(datalink.bysemantics("#this")).content_type + 'application/fits' + Server-side processing ---------------------- @@ -855,8 +863,8 @@ Some services support the server-side processing of record datasets. This includes spatial cutouts for 2d-images, reducing of spectra to a certain waveband range, and many more depending on the service. -Datalink -^^^^^^^^ +Generic Datalink Processing Service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Generic access to processing services is provided through the datalink interface. @@ -866,8 +874,8 @@ interface. >>> datalink_proc = next(row.getdatalink().bysemantics('#proc')) .. note:: - most times there is only one processing service per result, and thats all you - need. 
+ Most datalink documents only have one processing service per dataset, + which is why there is the ``get_first_proc`` shortcut mentioned below. The returned object lets you access the available input parameters which you diff --git a/pyvo/dal/adhoc.py b/pyvo/dal/adhoc.py index ccfab6ad..8d92e3b7 100644 --- a/pyvo/dal/adhoc.py +++ b/pyvo/dal/adhoc.py @@ -198,7 +198,10 @@ def iter_datalinks(self): if batch_size is None: # first call. self.query = DatalinkQuery.from_resource( - [_ for _ in self], self._datalink, session=self._session) + [_ for _ in self], + self._datalink, + session=self._session, + original_row=row) remaining_ids = self.query['ID'] if not remaining_ids: # we are done @@ -217,9 +220,13 @@ def iter_datalinks(self): id1 = current_ids.pop(0) processed_ids.append(id1) remaining_ids.remove(id1) - yield current_batch.clone_byid(id1) + yield current_batch.clone_byid( + id1, + original_row=row) elif row.access_format == DATALINK_MIME_TYPE: - yield DatalinkResults.from_result_url(row.getdataurl()) + yield DatalinkResults.from_result_url( + row.getdataurl(), + original_row=row) else: yield None @@ -366,6 +373,8 @@ def from_resource(cls, rows, resource, *, session=None, **kwargs): ref="srcGroup"/> """ + original_row = kwargs.pop("original_row", None) + input_params = _get_input_params_from_resource(resource) # get params outside of any group dl_params = _get_params_from_resource(resource) @@ -402,7 +411,11 @@ def from_resource(cls, rows, resource, *, session=None, **kwargs): except KeyError: query_params[name] = query_param - return cls(accessurl, session=session, **query_params) + return cls( + accessurl, + session=session, + original_row=original_row, + **query_params) def __init__( self, baseurl, *, id=None, responseformat=None, session=None, **keywords): @@ -420,6 +433,8 @@ def __init__( session : object optional session to use for network requests """ + self.original_row = keywords.pop("original_row", None) + super().__init__(baseurl, session=session, 
**keywords) if id is not None: @@ -441,8 +456,11 @@ def execute(self, post=False): DALFormatError for errors parsing the VOTable response """ - return DatalinkResults(self.execute_votable(post=post), - url=self.queryurl, session=self._session) + return DatalinkResults( + self.execute_votable(post=post), + url=self.queryurl, + original_row=self.original_row, + session=self._session) class DatalinkResults(DatalinkResultsMixin, DALResults): @@ -488,6 +506,10 @@ class DatalinkResults(DatalinkResultsMixin, DALResults): a Numpy array. """ + def __init__(self, *args, **kwargs): + self.original_row = kwargs.pop("original_row", None) + super().__init__(*args, **kwargs) + def getrecord(self, index): """ return a representation of a datalink result record that follows @@ -503,7 +525,7 @@ def getrecord(self, index): Returns ------- - REc + Rec a dictionary-like wrapper containing the result record metadata. Raises @@ -569,10 +591,10 @@ def bysemantics(self, semantics, *, include_narrower=True): if record.semantics in semantics: yield record - def clone_byid(self, id): + def clone_byid(self, id, *, original_row=None): """ return a clone of the object with results and corresponding - resources matching a given id + resources matching a given id Returns ------- @@ -597,7 +619,7 @@ def clone_byid(self, id): for x in copy_tb.resources: if x.ID and x.ID not in referenced_serviced: copy_tb.resources.remove(x) - return DatalinkResults(copy_tb) + return DatalinkResults(copy_tb, original_row=original_row) def getdataset(self, *, timeout=None): """ @@ -629,6 +651,13 @@ def get_first_proc(self): return proc raise IndexError("No processing service found in datalink result") + @classmethod + def from_result_url(cls, result_url, *, session=None, original_row=None): + res = super(DatalinkResults, cls).from_result_url( + result_url, session=session) + res.original_row = original_row + return res + class SodaRecordMixin: """ diff --git a/pyvo/dal/tests/test_datalink.py 
b/pyvo/dal/tests/test_datalink.py index 676ff214..daf77b27 100644 --- a/pyvo/dal/tests/test_datalink.py +++ b/pyvo/dal/tests/test_datalink.py @@ -110,6 +110,8 @@ def test_datalink(): datalinks = next(results.iter_datalinks()) + assert datalinks.original_row["accsize"] == 100800 + row = datalinks[0] assert row.semantics == "#progenitor" @@ -132,7 +134,9 @@ def test_datalink_batch(): results = vo.dal.imagesearch( 'http://example.com/obscore', (30, 30)) - assert len([_ for _ in results.iter_datalinks()]) == 3 + dls = list(results.iter_datalinks()) + assert len(dls) == 3 + assert dls[0].original_row["obs_collection"] == "MACHO" @pytest.mark.usefixtures('proc', 'datalink_vocabulary') @@ -143,6 +147,8 @@ def test_datalink_batch(): class TestSemanticsRetrieval: def test_access_with_string(self): datalinks = DatalinkResults.from_result_url('http://example.com/proc') + + assert datalinks.original_row is None res = [r["access_url"] for r in datalinks.bysemantics("#this")] assert len(res) == 1 assert res[0].endswith("eq010000ms/20100927.comb_avg.0001.fits.fz") From 84d48e44eecd00fb7b16390fc94abdc9d05bd2ca Mon Sep 17 00:00:00 2001 From: Markus Demleitner Date: Tue, 2 Jul 2024 09:38:24 +0200 Subject: [PATCH 3/3] Editorial changes after @bsipocz's PR review. --- docs/dal/index.rst | 3 +++ pyvo/dal/adhoc.py | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/dal/index.rst b/docs/dal/index.rst index 3b7b9882..6f664eb4 100644 --- a/docs/dal/index.rst +++ b/docs/dal/index.rst @@ -836,6 +836,9 @@ be None if pyvo does not know what row the datalink came from): >>> dl.original_row["obs_title"] 'CALIFA V1200 UGC00005' +Consider ``original_row`` read only. We do not define what happens when +you modify it. + Rows from tables supporting datalink also have a ``getdatalink()`` method that returns a ``DatalinkResults`` instance. 
In general, this is less flexible than using ``iter_datalinks``, and it may also cause more diff --git a/pyvo/dal/adhoc.py b/pyvo/dal/adhoc.py index 8d92e3b7..48e4fc39 100644 --- a/pyvo/dal/adhoc.py +++ b/pyvo/dal/adhoc.py @@ -653,8 +653,7 @@ def get_first_proc(self): @classmethod def from_result_url(cls, result_url, *, session=None, original_row=None): - res = super(DatalinkResults, cls).from_result_url( - result_url, session=session) + res = super().from_result_url(result_url, session=session) res.original_row = original_row return res