From ef203f5ed3525f84bddd675299a69587a7b52132 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Wed, 20 May 2026 15:57:08 -0400 Subject: [PATCH 1/3] Add benchmarks for xrootd and webdav --- benchmarks/test_protocols.py | 124 +++++++++++++++++++++++++++++++++++ pyproject.toml | 5 ++ 2 files changed, 129 insertions(+) create mode 100644 benchmarks/test_protocols.py diff --git a/benchmarks/test_protocols.py b/benchmarks/test_protocols.py new file mode 100644 index 0000000..06bfe54 --- /dev/null +++ b/benchmarks/test_protocols.py @@ -0,0 +1,124 @@ +""" +Benchmarks for catalog access across protocols and storage locations. + +Protocols +--------- +- http: Public HTTPS endpoint (UW). No credentials required. Uses standard + HTTP range-requests to fetch Parquet row-groups. + +- aws: Public S3 bucket (AWS). No credentials required. Uses anonymous S3 + access. Requires the ``s3fs`` Python package. + +- xrootd: On USDF. Uses the XRootD protocol for efficient streaming. Requires + the ``xrootd`` and ``fsspec-xrootd`` Python packages and valid SLAC + credentials. + +- webdav: On USDF. Uses plain HTTP range-requests via the WebDAV endpoint. + Requires no credentials. 
+""" + +import pytest +import lsdb + +GAIA_URLS = { + "http": "https://data.lsdb.io/hats/gaia_dr3", + "aws": "s3://stpubdata/gaia/gaia_dr3/public/hats", + "xrootd": "root://sdfdtn001.slac.stanford.edu:1094//lsdb/gaia_dr3", + "webdav": "http://sdfdtn001.slac.stanford.edu:1094/lsdb/gaia_dr3", +} +all_protocols = list(GAIA_URLS.keys()) + + +# Test 1: Open catalog + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_open_catalog(lbench, protocol): + """Open GAIA DR3 (reads catalog metadata only).""" + lbench(lambda: lsdb.open_catalog(GAIA_URLS[protocol])) + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_open_catalog_radec(lbench, protocol): + """Open GAIA DR3 selecting only ra/dec columns.""" + lbench(lambda: lsdb.open_catalog(GAIA_URLS[protocol], columns=["ra", "dec"])) + + +# Test 2: Open catalog and compute second partition + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_open_catalog_compute_partition(lbench, protocol): + """Open the catalog and compute the 2nd partition of GAIA.""" + + def open_and_compute(): + cat = lsdb.open_catalog(GAIA_URLS[protocol]) + partition = cat.partitions[1] + partition.compute() + + lbench(open_and_compute) + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_open_catalog_compute_partition_radec(lbench, protocol): + """Open the catalog and compute the 2nd partition of GAIA, using only ra/dec.""" + + def open_and_compute(): + cat = lsdb.open_catalog(GAIA_URLS[protocol], columns=["ra", "dec"]) + partition = cat.partitions[1] + partition.compute() + + lbench(open_and_compute) + + +# Test 3/4: Crossmatch GAIA at USDF with external GAIA + + +@pytest.mark.parametrize("external_protocol", ["http", "aws"]) +@pytest.mark.parametrize("internal_protocol", ["xrootd", "webdav"]) +def test_crossmatch_usdf(lbench_dask, external_protocol, internal_protocol): + """Crossmatch GAIA at USDF against an external GAIA.""" + + def crossmatch(): + cone = lsdb.ConeSearch(ra=0.0, dec=0.0, 
radius_arcsec=3600) + gaia_int = lsdb.open_catalog(GAIA_URLS[internal_protocol], search_filter=cone) + gaia_ext = lsdb.open_catalog(GAIA_URLS[external_protocol], search_filter=cone) + xmatch = gaia_int.crossmatch(gaia_ext, radius_arcsec=1, suffixes=("_1", "_2")) + xmatch.compute() + + lbench_dask(crossmatch) + + +@pytest.mark.parametrize("external_protocol", ["http", "aws"]) +@pytest.mark.parametrize("internal_protocol", ["xrootd", "webdav"]) +def test_crossmatch_usdf_ra_dec(lbench_dask, external_protocol, internal_protocol): + """Crossmatch GAIA at USDF against an external GAIA, with only ra/dec.""" + + def crossmatch(): + cone = lsdb.ConeSearch(ra=0.0, dec=0.0, radius_arcsec=3600) + gaia_int = lsdb.open_catalog( + GAIA_URLS[internal_protocol], columns=["ra", "dec"], search_filter=cone + ) + gaia_ext = lsdb.open_catalog( + GAIA_URLS[external_protocol], columns=["ra", "dec"], search_filter=cone + ) + xmatch = gaia_int.crossmatch(gaia_ext, radius_arcsec=1, suffixes=("_1", "_2")) + xmatch.compute() + + lbench_dask(crossmatch) + + +# Test 5: Cone search with magnitude cut + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_cone_search_magnitude_filter(lbench_dask, protocol): + """One degree cone around (ra=0, dec=0) filtered to phot_g_mean_mag < 16.""" + + def search_and_filter(): + gaia = lsdb.open_catalog(GAIA_URLS[protocol]) + cone = gaia.cone_search(ra=0.0, dec=0.0, radius_arcsec=3600) + query = cone.query("phot_g_mean_mag < 16") + query.compute() + + lbench_dask(search_and_filter) diff --git a/pyproject.toml b/pyproject.toml index 8d8851a..01acf03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,11 @@ dev = [ "pre-commit", # Used to run checks before finalizing a git commit "pytest-cov", # Used to report total code coverage ] +protocols = [ + "xrootd", # XRootD filesystem for protocol benchmarks + "fsspec-xrootd", # XRootD integration with fsspec + "webdav4[fsspec]", # WebDAV filesystem for protocol benchmarks +] [build-system] 
requires = [ From 42d91416d0f650cecee1b86cc11b0473dacf061c Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Wed, 20 May 2026 16:28:36 -0400 Subject: [PATCH 2/3] Apply black formatting and sort imports --- benchmarks/test_protocols.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/benchmarks/test_protocols.py b/benchmarks/test_protocols.py index 06bfe54..b14dd2e 100644 --- a/benchmarks/test_protocols.py +++ b/benchmarks/test_protocols.py @@ -17,8 +17,8 @@ Requires no credentials. """ -import pytest import lsdb +import pytest GAIA_URLS = { "http": "https://data.lsdb.io/hats/gaia_dr3", @@ -96,12 +96,8 @@ def test_crossmatch_usdf_ra_dec(lbench_dask, external_protocol, internal_protoco def crossmatch(): cone = lsdb.ConeSearch(ra=0.0, dec=0.0, radius_arcsec=3600) - gaia_int = lsdb.open_catalog( - GAIA_URLS[internal_protocol], columns=["ra", "dec"], search_filter=cone - ) - gaia_ext = lsdb.open_catalog( - GAIA_URLS[external_protocol], columns=["ra", "dec"], search_filter=cone - ) + gaia_int = lsdb.open_catalog(GAIA_URLS[internal_protocol], columns=["ra", "dec"], search_filter=cone) + gaia_ext = lsdb.open_catalog(GAIA_URLS[external_protocol], columns=["ra", "dec"], search_filter=cone) xmatch = gaia_int.crossmatch(gaia_ext, radius_arcsec=1, suffixes=("_1", "_2")) xmatch.compute() From ab09adaff2514151b148d30e59abcd399b7e2ce2 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Thu, 21 May 2026 11:52:54 -0400 Subject: [PATCH 3/3] Fix units in plotting --- src/lbench/dashboard/metrics/metric.py | 2 +- src/lbench/dashboard/utils.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lbench/dashboard/metrics/metric.py b/src/lbench/dashboard/metrics/metric.py index 5fcaf45..e5ce1cb 100644 --- a/src/lbench/dashboard/metrics/metric.py +++ b/src/lbench/dashboard/metrics/metric.py @@ -107,7 +107,7 @@ def get_table_column_name(self, value=None) -> str: def get_plot_scale_and_unit(self, values) -> tuple: representative = float(values.median()) - for
threshold, unit in [(1e-3, "ms"), (1e-6, "µs"), (1e-9, "ns")]: + for threshold, unit in [(60, "min"), (1, "s"), (1e-3, "ms"), (1e-6, "µs"), (1e-9, "ns")]: if representative >= threshold: return float(threshold), unit return 1.0, "s" diff --git a/src/lbench/dashboard/utils.py b/src/lbench/dashboard/utils.py index bb15396..eb38d07 100644 --- a/src/lbench/dashboard/utils.py +++ b/src/lbench/dashboard/utils.py @@ -34,7 +34,9 @@ def format_duration(seconds, digits=3): except (TypeError, ValueError): return str(seconds), "" - if seconds >= 1: + if seconds >= 60: + return f"{seconds / 60:.{digits}f}", "min" + elif seconds >= 1: return f"{seconds:.{digits}f}", "s" elif seconds >= 1e-3: return f"{seconds * 1e3:.{digits}f}", "ms"