From 4750898bf209072ed7c30c92a01c6e6dc85f5f9a Mon Sep 17 00:00:00 2001 From: Justin Braaten Date: Wed, 15 Apr 2026 19:02:47 +0000 Subject: [PATCH] Add configurable getInfo retry handling and docs --- docs/faq.md | 31 +++++++++ docs/guide.md | 34 ++++++++++ docs/performance.md | 36 +++++++++++ xee/ext.py | 38 ++++++++++- xee/ext_integration_test.py | 26 ++++++++ xee/ext_test.py | 125 ++++++++++++++++++++++++++++++++++++ xee/helpers.py | 18 +++++- xee/retries.py | 66 +++++++++++++++++++ 8 files changed, 370 insertions(+), 4 deletions(-) create mode 100644 xee/retries.py diff --git a/docs/faq.md b/docs/faq.md index 18b1acc..5a45311 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -13,6 +13,37 @@ Use `grid_shape` when a fixed pixel width/height is required (e.g., ML model inp ## I get 429 quota errors. What do I do? Reduce parallelism (fewer Dask workers), narrow the AOI or time range, combine server-side operations before opening, or switch to the standard endpoint for computed collections. +Xee also exposes retry controls for both pixel fetches and metadata `getInfo()` +calls. For example: + +```python +import xarray as xr + +ds = xr.open_dataset( + 'ee://ECMWF/ERA5_LAND/MONTHLY_AGGR', + engine='ee', + crs='EPSG:4326', + crs_transform=(0.25, 0, -180, 0, -0.25, 90), + shape_2d=(1440, 720), + getitem_kwargs={ + 'max_retries': 8, + 'initial_delay': 500, + }, + getinfo_kwargs={ + 'max_retries': 8, + 'initial_delay': 1000, + }, +) +``` + +Defaults are: + +- `getitem_kwargs`: `max_retries=6`, `initial_delay=500` ms +- `getinfo_kwargs`: `max_retries=6`, `initial_delay=1000` ms + +`helpers.extract_grid_params(...)` also supports `getinfo_kwargs` if metadata +calls need tuning under quota pressure. + ## Can I open a computed `ee.ImageCollection`? Yes. Build the collection with filtering / mapping functions, then pass the resulting collection object directly to `xr.open_dataset(..., engine='ee')` with grid parameters. diff --git a/docs/guide.md b/docs/guide.md index 1678fc4..5e94bf8 100644 --- a/docs/guide.md +++ b/docs/guide.md @@ -154,6 +154,40 @@ temp_slice = ds['temperature_2m'].isel(time=0) temp_slice.plot() ``` +## Configure Retries + +Xee supports configurable retries for two paths: + +- Pixel reads via `getitem_kwargs` +- Metadata `getInfo()` calls via `getinfo_kwargs` + +```python +import ee +import xarray as xr +from xee import helpers + +ic = ee.ImageCollection('ECMWF/ERA5_LAND/MONTHLY_AGGR') + +# Optional: tune helper metadata fetch retries. +grid_params = helpers.extract_grid_params( + ic, + getinfo_kwargs={'max_retries': 8, 'initial_delay': 1000}, +) + +ds = xr.open_dataset( + ic, + engine='ee', + **grid_params, + getitem_kwargs={'max_retries': 8, 'initial_delay': 500}, + getinfo_kwargs={'max_retries': 8, 'initial_delay': 1000}, +) +``` + +Defaults: + +- `getitem_kwargs`: `max_retries=6`, `initial_delay=500` ms +- `getinfo_kwargs`: `max_retries=6`, `initial_delay=1000` ms + ## Further Resources - [Core Concepts](concepts.md) diff --git a/docs/performance.md b/docs/performance.md index 4a312f6..1b5a995 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -26,6 +26,42 @@ Recommendations: 3. Consolidate operations server-side (EE `.map`, `.select`, band math) before opening in Xee. 4. Cache intermediate results in memory rather than re-opening repeatedly. +## Retry Tuning + +Xee uses exponential backoff with jitter for: + +- Pixel requests (`getitem_kwargs`) used during array reads. +- Metadata `getInfo()` requests (`getinfo_kwargs`) used during dataset setup and + helper metadata fetches. + +Defaults: + +- `getitem_kwargs`: `max_retries=6`, `initial_delay=500` ms +- `getinfo_kwargs`: `max_retries=6`, `initial_delay=1000` ms + +`getinfo_kwargs` starts with a longer default delay to reduce setup-time retry bursts against EE metadata endpoints. + +You can tune these in `xr.open_dataset(...)`: + +```python +ds = xr.open_dataset( + collection, + engine='ee', + crs='EPSG:4326', + crs_transform=(0.25, 0, -180, 0, -0.25, 90), + shape_2d=(1440, 720), + getitem_kwargs={'max_retries': 8, 'initial_delay': 500}, + getinfo_kwargs={'max_retries': 8, 'initial_delay': 1000}, +) +``` + +Rule of thumb: + +1. If failures happen during dataset open / metadata fetch, tune + `getinfo_kwargs` first. +2. If failures happen during chunk reads / compute, tune `getitem_kwargs` first. +3. Reduce Dask concurrency before increasing retries too aggressively. + ## Chunk Size Considerations EE responses have an upper size limit (tens of MB). Xee's backend picks reasonable pixel window sizes automatically. If you see many small requests, consider choosing a coarser grid or limiting variable selection to needed bands. diff --git a/xee/ext.py b/xee/ext.py index 8669500..0e9bfcc 100644 --- a/xee/ext.py +++ b/xee/ext.py @@ -41,6 +41,7 @@ from xarray.backends import store as backends_store from xarray.core import indexing from xarray.core import utils +from xee import retries from xee import types import ee @@ -128,6 +129,11 @@ class EarthEngineStore(common.AbstractDataStore): 'initial_delay': 500, } + GETINFO_KWARGS: dict[str, int] = { + 'max_retries': 6, + 'initial_delay': 1000, + } + SCALE_UNITS: dict[str, int] = { 'degree': 1, 'metre': 10_000, @@ -166,6 +172,7 @@ def open( ee_init_if_necessary: bool = False, executor_kwargs: dict[str, Any] | None = None, getitem_kwargs: dict[str, int] | None = None, + getinfo_kwargs: dict[str, int] | None = None, fast_time_slicing: bool = False, ) -> EarthEngineStore: if mode != 'r': @@ -188,6 +195,7 @@ def open( ee_init_if_necessary=ee_init_if_necessary, executor_kwargs=executor_kwargs, getitem_kwargs=getitem_kwargs, + getinfo_kwargs=getinfo_kwargs, fast_time_slicing=fast_time_slicing, ) @@ -207,6 +215,7 @@ def __init__( ee_init_if_necessary: bool = False, executor_kwargs: dict[str, Any] | None = None, getitem_kwargs: dict[str, int] | None = None, + getinfo_kwargs: dict[str, int] | None = None, fast_time_slicing: bool = False, ): # Ensure crs_transform is a tuple and create the affine.Affine object. @@ -236,6 +245,7 @@ def __init__( self.executor_kwargs = executor_kwargs self.getitem_kwargs = {**self.GETITEM_KWARGS, **(getitem_kwargs or {})} + self.getinfo_kwargs = {**self.GETINFO_KWARGS, **(getinfo_kwargs or {})} self.image_collection = image_collection if n_images != -1: @@ -306,7 +316,11 @@ def get_info(self) -> dict[str, Any]: ) ) - info = ee.List([rpc for _, rpc in rpcs]).getInfo() + info = retries.robust_call( + lambda: ee.List([rpc for _, rpc in rpcs]).getInfo(), + catch=ee.ee_exception.EEException, + **self.getinfo_kwargs, + ) return dict(zip((name for name, _ in rpcs), info)) @@ -657,9 +671,20 @@ def _ee_bounds_to_bounds(bounds: dict[str, Any]) -> types.Bounds: return x_min, y_min, x_max, y_max -def geometry_to_bounds(geom: ee.Geometry) -> types.Bounds: +def geometry_to_bounds( + geom: ee.Geometry, + getinfo_kwargs: dict[str, int] | None = None, +) -> types.Bounds: """Finds the CRS bounds from a ee.Geometry polygon.""" - bounds = geom.bounds().getInfo() + getinfo_kwargs = { + **EarthEngineStore.GETINFO_KWARGS, + **(getinfo_kwargs or {}), + } + bounds = retries.robust_call( + lambda: geom.bounds().getInfo(), + catch=ee.ee_exception.EEException, + **getinfo_kwargs, + ) return _ee_bounds_to_bounds(bounds) @@ -920,6 +945,7 @@ def open_dataset( ee_init_kwargs: dict[str, Any] | None = None, executor_kwargs: dict[str, Any] | None = None, getitem_kwargs: dict[str, int] | None = None, + getinfo_kwargs: dict[str, int] | None = None, fast_time_slicing: bool = False, ) -> xarray.Dataset: # type: ignore """Open an Earth Engine ImageCollection as an Xarray Dataset. @@ -989,6 +1015,11 @@ def open_dataset( - 'max_retries', the maximum number of retry attempts. Defaults to 6. - 'initial_delay', the initial delay in milliseconds before the first retry. Defaults to 500. + getinfo_kwargs (optional): Exponential backoff kwargs applied to + Earth Engine `getInfo()` calls used by Xee metadata workflows. + - 'max_retries', the maximum number of retry attempts. Defaults to 6. + - 'initial_delay', the initial delay in milliseconds before the first + retry. Defaults to 1000. fast_time_slicing (optional): Whether to perform an optimization that makes slicing an ImageCollection across time faster. This optimization loads EE images in a slice by ID, so any modifications to images in a @@ -1023,6 +1054,7 @@ def open_dataset( ee_init_if_necessary=ee_init_if_necessary, executor_kwargs=executor_kwargs, getitem_kwargs=getitem_kwargs, + getinfo_kwargs=getinfo_kwargs, fast_time_slicing=fast_time_slicing, ) diff --git a/xee/ext_integration_test.py b/xee/ext_integration_test.py index 23d36ed..efb7ab5 100644 --- a/xee/ext_integration_test.py +++ b/xee/ext_integration_test.py @@ -105,6 +105,14 @@ def setUp(self): n_images=64, getitem_kwargs={'max_retries': 9}, ) + self.getinfo_tuned_store = xee.EarthEngineStore( + ee.ImageCollection('LANDSAT/LC08/C02/T1').filterDate( + '2017-01-01', '2017-01-03' + ), + n_images=64, + getinfo_kwargs={'max_retries': 9, 'initial_delay': 1200}, + **_TEST_GRID_PARAMS, + ) self.all_img_store = xee.EarthEngineStore( ee.ImageCollection('LANDSAT/LC08/C02/T1').filterDate( '2017-01-01', '2017-01-03' @@ -298,6 +306,15 @@ def test_getitem_kwargs(self): self.assertEqual(arr2.store.getitem_kwargs['initial_delay'], 500) self.assertEqual(arr2.store.getitem_kwargs['max_retries'], 9) + def test_getinfo_kwargs(self): + arr = xee.EarthEngineBackendArray('B4', self.getinfo_tuned_store) + self.assertEqual(arr.store.getinfo_kwargs['initial_delay'], 1200) + self.assertEqual(arr.store.getinfo_kwargs['max_retries'], 9) + + arr1 = xee.EarthEngineBackendArray('longitude', self.lnglat_store) + self.assertEqual(arr1.store.getinfo_kwargs['initial_delay'], 1000) + self.assertEqual(arr1.store.getinfo_kwargs['max_retries'], 6) + class EEBackendEntrypointTest(absltest.TestCase): @@ -632,6 +649,15 @@ def test_extract_grid_params_from_image(self): self.assertEqual(grid_params['crs'], 'EPSG:32613') np.allclose(grid_params['crs_transform'], [30, 0, 643185, 0, -30, 4255815]) + def test_extract_grid_params_from_image_with_getinfo_kwargs(self): + img = ee.Image('LANDSAT/LT05/C02/T1_TOA/LT05_031034_20110619') + grid_params = helpers.extract_grid_params( + img, getinfo_kwargs={'max_retries': 8, 'initial_delay': 1100} + ) + self.assertEqual(grid_params['shape_2d'], (7881, 6981)) + self.assertEqual(grid_params['crs'], 'EPSG:32613') + np.allclose(grid_params['crs_transform'], [30, 0, 643185, 0, -30, 4255815]) + def test_extract_grid_params_from_image_collection(self): dem = ee.ImageCollection('COPERNICUS/DEM/GLO30') grid_params = helpers.extract_grid_params(dem) diff --git a/xee/ext_test.py b/xee/ext_test.py index c04c08c..d6dc499 100644 --- a/xee/ext_test.py +++ b/xee/ext_test.py @@ -221,6 +221,79 @@ def test_init_with_invalid_transform_type(self): shape_2d=(360, 180), ) + @mock.patch.object( + ext.EarthEngineStore, + 'get_info', + new_callable=mock.PropertyMock, + ) + def test_getinfo_kwargs_defaults_and_overrides(self, mock_get_info): + mock_get_info.return_value = { + 'size': 1, + 'props': {}, + 'first': { + 'bands': [ + { + 'id': 'b1', + 'data_type': {'type': 'PixelType', 'precision': 'float'}, + } + ] + }, + } + + default_store = xee.EarthEngineStore( + image_collection=mock.MagicMock(), + crs='EPSG:4326', + crs_transform=(1.0, 0.0, -180.0, 0.0, -1.0, 90.0), + shape_2d=(360, 180), + ) + self.assertEqual(default_store.getinfo_kwargs['initial_delay'], 1000) + self.assertEqual(default_store.getinfo_kwargs['max_retries'], 6) + + configured_store = xee.EarthEngineStore( + image_collection=mock.MagicMock(), + crs='EPSG:4326', + crs_transform=(1.0, 0.0, -180.0, 0.0, -1.0, 90.0), + shape_2d=(360, 180), + getinfo_kwargs={'max_retries': 9}, + ) + self.assertEqual(configured_store.getinfo_kwargs['initial_delay'], 1000) + self.assertEqual(configured_store.getinfo_kwargs['max_retries'], 9) + + @mock.patch.object(ext.retries, 'robust_call') + @mock.patch.object(ext.ee, 'List') + @mock.patch.object(ext.ee, 'Reducer') + def test_get_info_uses_retry_settings( + self, mock_reducer, mock_ee_list, mock_robust_call + ): + store = object.__new__(xee.EarthEngineStore) + store.image_collection = mock.MagicMock() + store.primary_dim_property = 'system:time_start' + store.getinfo_kwargs = {'max_retries': 9, 'initial_delay': 1200} + mock_reducer.toList.return_value.repeat.return_value = mock.MagicMock() + + mock_ee_list.return_value.getInfo.return_value = [ + 1, + {}, + { + 'bands': [ + { + 'id': 'b1', + 'data_type': {'type': 'PixelType', 'precision': 'float'}, + } + ] + }, + (['id-1'], [123456]), + ] + mock_robust_call.return_value = mock_ee_list.return_value.getInfo.return_value + + info = store.get_info + + kwargs = mock_robust_call.call_args.kwargs + self.assertEqual(kwargs['catch'], ext.ee.ee_exception.EEException) + self.assertEqual(kwargs['max_retries'], 9) + self.assertEqual(kwargs['initial_delay'], 1200) + self.assertEqual(info['size'], 1) + class ParseEEInitKwargsTest(absltest.TestCase): @@ -384,6 +457,58 @@ def test_fit_geometry_with_rounding(self): self.assertAlmostEqual(grid_dict['crs_transform'][0], 0.1) self.assertAlmostEqual(grid_dict['crs_transform'][4], -0.1) + @mock.patch.object(ext.retries, 'robust_call') + def test_geometry_to_bounds_uses_retry_settings(self, mock_robust_call): + mock_robust_call.return_value = { + 'coordinates': [[[1.0, 2.0], [3.0, 2.0], [3.0, 5.0], [1.0, 5.0]]] + } + geom = mock.MagicMock() + + bounds = ext.geometry_to_bounds( + geom, + getinfo_kwargs={'max_retries': 8, 'initial_delay': 1400}, + ) + + kwargs = mock_robust_call.call_args.kwargs + self.assertEqual(kwargs['catch'], ext.ee.ee_exception.EEException) + self.assertEqual(kwargs['max_retries'], 8) + self.assertEqual(kwargs['initial_delay'], 1400) + self.assertEqual(bounds, (1.0, 2.0, 3.0, 5.0)) + + @mock.patch.object(helpers.retries, 'robust_call') + def test_extract_grid_params_uses_retry_settings(self, mock_robust_call): + class FakeImage: + pass + + class FakeImageCollection: + pass + + image = FakeImage() + mock_robust_call.return_value = { + 'bands': [ + { + 'crs': 'EPSG:4326', + 'crs_transform': [1.0, 0.0, 2.0, 0.0, -1.0, 3.0], + 'dimensions': [4, 5], + } + ] + } + + with mock.patch.object(helpers.ee, 'Image', FakeImage), mock.patch.object( + helpers.ee, 'ImageCollection', FakeImageCollection + ): + params = helpers.extract_grid_params( + image, + getinfo_kwargs={'max_retries': 7, 'initial_delay': 1300}, + ) + + kwargs = mock_robust_call.call_args.kwargs + self.assertEqual(kwargs['catch'], helpers.ee.ee_exception.EEException) + self.assertEqual(kwargs['max_retries'], 7) + self.assertEqual(kwargs['initial_delay'], 1300) + self.assertEqual(params['crs'], 'EPSG:4326') + self.assertEqual(params['shape_2d'], (4, 5)) + if __name__ == '__main__': absltest.main() diff --git a/xee/helpers.py b/xee/helpers.py index c09f5a9..c4e0bcb 100644 --- a/xee/helpers.py +++ b/xee/helpers.py @@ -41,12 +41,18 @@ from pyproj import Transformer import shapely from shapely.ops import transform +from xee import retries TransformType = tuple[float, float, float, float, float, float] ShapeType = tuple[int, int] ScalingType = tuple[float, float] +GETINFO_KWARGS: dict[str, int] = { + 'max_retries': 6, + 'initial_delay': 1000, +} + class PixelGridParams(TypedDict): """TypedDict describing pixel grid parameters. @@ -189,6 +195,7 @@ def fit_geometry( def extract_grid_params( ee_obj: Union[ee.Image, ee.ImageCollection], + getinfo_kwargs: dict[str, int] | None = None, ) -> PixelGridParams: """Return native pixel grid parameters for an EE Image or ImageCollection. @@ -198,6 +205,9 @@ def extract_grid_params( Args: ee_obj: ``ee.Image`` or ``ee.ImageCollection`` instance. + getinfo_kwargs: Exponential backoff kwargs used for Earth Engine + ``getInfo()`` retries. Supported keys are ``max_retries`` and + ``initial_delay`` (milliseconds). Returns: ``PixelGridParams`` mapping the native CRS, transform, and dimensions. @@ -215,7 +225,13 @@ def extract_grid_params( f'Expected ee.Image or ee.ImageCollection, got {type(ee_obj)}' ) - first_band_info = img_obj.select(0).getInfo()['bands'][0] + getinfo_kwargs = {**GETINFO_KWARGS, **(getinfo_kwargs or {})} + info = retries.robust_call( + lambda: img_obj.select(0).getInfo(), + catch=ee.ee_exception.EEException, + **getinfo_kwargs, + ) + first_band_info = info['bands'][0] return dict( crs=first_band_info['crs'], diff --git a/xee/retries.py b/xee/retries.py new file mode 100644 index 0000000..6bb5ba0 --- /dev/null +++ b/xee/retries.py @@ -0,0 +1,66 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Retry helpers for Earth Engine RPC calls.""" + +from __future__ import annotations + +from collections.abc import Callable +import logging +import time +import traceback +from typing import TypeVar + +import numpy as np + + +logger = logging.getLogger(__name__) + +T = TypeVar('T') + + +def robust_call( + fn: Callable[[], T], + catch: type[Exception] | tuple[type[Exception], ...] = Exception, + max_retries: int = 6, + initial_delay: int = 500, +) -> T: + """Execute a callable with exponential backoff and jitter. + + Args: + fn: Callable to execute. + catch: Exception type(s) to retry. + max_retries: Maximum number of retry attempts. + initial_delay: Initial retry delay in milliseconds. + + Returns: + The return value of ``fn``. + """ + assert max_retries >= 0 + for n in range(max_retries + 1): + try: + return fn() + except catch: + if n == max_retries: + raise + base_delay = initial_delay * 2**n + jitter = np.random.randint(base_delay) if base_delay > 0 else 0 + next_delay = base_delay + jitter + msg = ( + f'call failed, waiting {next_delay} ms before trying again ' + f'({max_retries - n} tries remaining). ' + f'Full traceback: {traceback.format_exc()}' + ) + logger.debug(msg) + time.sleep(1e-3 * next_delay) \ No newline at end of file