diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index ab079070..6cd7efe1 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -871,13 +871,30 @@ def __init__( self.canonical_url: str | None = None axes = _extract_axes(args) - # No chunkable axes → skip ``build_request`` entirely; the - # common Water Data call shape shouldn't pay for an unused - # request prep on the passthrough hot path. The fetcher - # will run with the user's args verbatim; if that produces - # an over-budget URL, the server (or httpx itself) rejects. if not axes: - return + # No chunkable axis: nothing to split. If the single request fits, + # run it verbatim (the common passthrough). ``_safe_request_bytes`` + # treats an un-constructable URL (httpx.InvalidURL, > 64 KB) as over + # budget. + if _safe_request_bytes(build_request, args, url_limit) <= url_limit: + return + # Over budget. A filter the chunker doesn't manage — cql-json — is + # passed through unchanged (chunking applies only to cql-text); the + # server, not us, judges it. Otherwise this is an in-domain shape we + # would normally chunk but can't (a single large CQL ``IN`` clause + # with no top-level ``OR``, or one oversized value), so raise an + # actionable error instead of shipping it for an opaque HTTP 414. + filter_expr = args.get("filter") + if filter_expr is not None and not _is_chunkable( + filter_expr, args.get("filter_lang") + ): + return + raise RequestTooLarge( + f"Request exceeds {url_limit} bytes (URL + body) and has no " + f"chunkable multi-value argument to split (e.g. a single large " + f"CQL `IN` clause, or one oversized value). Narrow the query, " + f"simplify the filter, or split the call manually." + ) # Constructing the initial request can itself trip # ``httpx.InvalidURL`` (URL > 64 KB) — that's the canonical diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py index a435a4de..8fede638 100644 --- a/tests/waterdata_chunking_test.py +++ b/tests/waterdata_chunking_test.py @@ -161,10 +161,30 @@ def test_extract_axes_skips_singletons_and_never_chunk_params(): def test_chunk_plan_returns_passthrough_when_no_chunkable_axes(): - """Scalar args with nothing to chunk → passthrough, even at a - URL limit the request technically exceeds (the server may 414, - but ``ChunkPlan`` has nothing to split).""" + """Scalar args with nothing to chunk and a request within the limit → + passthrough (no axes).""" args = {"monitoring_location_id": "scalar-only"} + plan = ChunkPlan(args, _fake_build, url_limit=1000) + assert plan.axes == [] + assert plan.total == 1 + + +def test_chunk_plan_raises_when_unchunkable_request_exceeds_limit(): + """A request with nothing to chunk that still exceeds the byte limit (e.g. + a single large CQL ``IN`` clause with no top-level ``OR``) raises + RequestTooLarge instead of being shipped for the server to reject with an + opaque HTTP 414.""" + args = {"monitoring_location_id": "scalar-only"} + with pytest.raises(RequestTooLarge): + ChunkPlan(args, _fake_build, url_limit=10) + + +def test_chunk_plan_passes_through_unchunkable_cql_json_over_limit(): + """A cql-json filter is outside the chunker's domain (it splits only + cql-text), so an over-budget cql-json request is passed through unchanged + instead of raising — the server judges it, not us. Guards against the + chunker hijacking the deliberate cql-json passthrough.""" + args = {"filter": "a OR b OR c", "filter_lang": "cql-json"} plan = ChunkPlan(args, _fake_build, url_limit=10) assert plan.axes == [] assert plan.total == 1