diff --git a/benchmarks/test_protocols.py b/benchmarks/test_protocols.py new file mode 100644 index 0000000..b14dd2e --- /dev/null +++ b/benchmarks/test_protocols.py @@ -0,0 +1,120 @@ +""" +Benchmarks for catalog access across protocols and storage locations. + +Protocols +--------- +- http: Public HTTPS endpoint (UW). No credentials required. Uses standard + HTTP range-requests to fetch Parquet row-groups. + +- aws: Public S3 bucket (AWS). No credentials required. Uses anonymous S3 + access. Requires the ``s3fs`` Python package. + +- xrootd: On USDF. Uses the XRootD protocol for efficient streaming. Requires + the ``xrootd`` and ``fsspec-xrootd`` Python packages and valid SLAC + credentials. + +- webdav: On USDF. Uses plain HTTP range-requests via the WebDAV endpoint. + Requires no credentials. +""" + +import lsdb +import pytest + +GAIA_URLS = { + "http": "https://data.lsdb.io/hats/gaia_dr3", + "aws": "s3://stpubdata/gaia/gaia_dr3/public/hats", + "xrootd": "root://sdfdtn001.slac.stanford.edu:1094//lsdb/gaia_dr3", + "webdav": "http://sdfdtn001.slac.stanford.edu:1094/lsdb/gaia_dr3", +} +all_protocols = list(GAIA_URLS.keys()) + + +# Test 1: Open catalog + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_open_catalog(lbench, protocol): + """Open GAIA DR3 (reads catalog metadata only).""" + lbench(lambda: lsdb.open_catalog(GAIA_URLS[protocol])) + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_open_catalog_radec(lbench, protocol): + """Open GAIA DR3 selecting only ra/dec columns.""" + lbench(lambda: lsdb.open_catalog(GAIA_URLS[protocol], columns=["ra", "dec"])) + + +# Test 2: Open catalog and compute second partition + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_open_catalog_compute_partition(lbench, protocol): + """Open the catalog and compute the 2nd partition of GAIA.""" + + def open_and_compute(): + cat = lsdb.open_catalog(GAIA_URLS[protocol]) + partition = cat.partitions[1] + partition.compute() + + lbench(open_and_compute) + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_open_catalog_compute_partition_radec(lbench, protocol): + """Open the catalog and compute the 2nd partition of GAIA, using only ra/dec.""" + + def open_and_compute(): + cat = lsdb.open_catalog(GAIA_URLS[protocol], columns=["ra", "dec"]) + partition = cat.partitions[1] + partition.compute() + + lbench(open_and_compute) + + +# Test 3/4: Crossmatch GAIA at USDF with external GAIA + + +@pytest.mark.parametrize("external_protocol", ["http", "aws"]) +@pytest.mark.parametrize("internal_protocol", ["xrootd", "webdav"]) +def test_crossmatch_usdf(lbench_dask, external_protocol, internal_protocol): + """Crossmatch GAIA at USDF against an external GAIA.""" + + def crossmatch(): + cone = lsdb.ConeSearch(ra=0.0, dec=0.0, radius_arcsec=3600) + gaia_int = lsdb.open_catalog(GAIA_URLS[internal_protocol], search_filter=cone) + gaia_ext = lsdb.open_catalog(GAIA_URLS[external_protocol], search_filter=cone) + xmatch = gaia_int.crossmatch(gaia_ext, radius_arcsec=1, suffixes=("_1", "_2")) + xmatch.compute() + + lbench_dask(crossmatch) + + +@pytest.mark.parametrize("external_protocol", ["http", "aws"]) +@pytest.mark.parametrize("internal_protocol", ["xrootd", "webdav"]) +def test_crossmatch_usdf_ra_dec(lbench_dask, external_protocol, internal_protocol): + """Crossmatch GAIA at USDF against an external GAIA, with only ra/dec.""" + + def crossmatch(): + cone = lsdb.ConeSearch(ra=0.0, dec=0.0, radius_arcsec=3600) + gaia_int = lsdb.open_catalog(GAIA_URLS[internal_protocol], columns=["ra", "dec"], search_filter=cone) + gaia_ext = lsdb.open_catalog(GAIA_URLS[external_protocol], columns=["ra", "dec"], search_filter=cone) + xmatch = gaia_int.crossmatch(gaia_ext, radius_arcsec=1, suffixes=("_1", "_2")) + xmatch.compute() + + lbench_dask(crossmatch) + + +# Test 5: Cone search with magnitude cut + + +@pytest.mark.parametrize("protocol", all_protocols) +def test_cone_search_magnitude_filter(lbench_dask, protocol): + """One degree cone around (ra=0, dec=0) filtered to phot_g_mean_mag < 16.""" + + def search_and_filter(): + gaia = lsdb.open_catalog(GAIA_URLS[protocol]) + cone = gaia.cone_search(ra=0.0, dec=0.0, radius_arcsec=3600) + query = cone.query("phot_g_mean_mag < 16") + query.compute() + + lbench_dask(search_and_filter) diff --git a/pyproject.toml b/pyproject.toml index 8d8851a..01acf03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,11 @@ dev = [ "pre-commit", # Used to run checks before finalizing a git commit "pytest-cov", # Used to report total code coverage ] +protocols = [ + "xrootd", # XRootD filesystem for protocol benchmarks + "fsspec-xrootd", # XRootD integration with fsspec + "webdav4[fsspec]", # WebDAV filesystem for protocol benchmarks +] [build-system] requires = [ diff --git a/src/lbench/dashboard/metrics/metric.py b/src/lbench/dashboard/metrics/metric.py index 5fcaf45..e5ce1cb 100644 --- a/src/lbench/dashboard/metrics/metric.py +++ b/src/lbench/dashboard/metrics/metric.py @@ -107,7 +107,7 @@ def get_table_column_name(self, value=None) -> str: def get_plot_scale_and_unit(self, values) -> tuple: representative = float(values.median()) - for threshold, unit in [(1e-3, "ms"), (1e-6, "µs"), (1e-9, "ns")]: + for threshold, unit in [(60, "min"), (1, "s"), (1e-3, "ms"), (1e-6, "µs"), (1e-9, "ns")]: if representative >= threshold: return float(threshold), unit return 1.0, "s" diff --git a/src/lbench/dashboard/utils.py b/src/lbench/dashboard/utils.py index bb15396..eb38d07 100644 --- a/src/lbench/dashboard/utils.py +++ b/src/lbench/dashboard/utils.py @@ -34,7 +34,9 @@ def format_duration(seconds, digits=3): except (TypeError, ValueError): return str(seconds), "" - if seconds >= 1: + if seconds >= 60: + return f"{seconds / 60:.{digits}f}", "min" + elif seconds >= 1: return f"{seconds:.{digits}f}", "s" elif seconds >= 1e-3: return f"{seconds * 1e3:.{digits}f}", "ms"