From 780074e889857fdc07a163880074db0d49a88521 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 10 Nov 2025 16:11:12 +0000 Subject: [PATCH 01/54] Add agentic pipeline runner scaffold Include callbacks, abort handling, and logging Cover runner lifecycle with new unit tests --- tools/agentic_import/pipeline.py | 127 ++++++++++++++++++++++++++ tools/agentic_import/pipeline_test.py | 77 ++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 tools/agentic_import/pipeline.py create mode 100644 tools/agentic_import/pipeline_test.py diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py new file mode 100644 index 0000000000..ecd9211331 --- /dev/null +++ b/tools/agentic_import/pipeline.py @@ -0,0 +1,127 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Generic building blocks for lightweight agentic pipelines. +""" + +from __future__ import annotations + +import abc +from dataclasses import dataclass +from typing import Sequence + +from absl import logging + + +class PipelineAbort(Exception): + """Raised when pipeline execution should stop early for any reason.""" + + +class Step(abc.ABC): + """Abstract pipeline step interface.""" + + @property + @abc.abstractmethod + def name(self) -> str: + """Human friendly identifier used for logging.""" + + @property + @abc.abstractmethod + def version(self) -> int: + """Version used for invalidation decisions.""" + + @abc.abstractmethod + def run(self) -> None: + """Execute the step.""" + + @abc.abstractmethod + def dry_run(self) -> str: + """Return a read-only preview of the work to be done.""" + + +class BaseStep(Step, abc.ABC): + """Helper base class that stores mandatory metadata.""" + + def __init__(self, *, name: str, version: int) -> None: + if not name: + raise ValueError("step requires a name") + self._name = name + self._version = version + + @property + def name(self) -> str: + return self._name + + @property + def version(self) -> int: + return self._version + + +@dataclass(frozen=True) +class Pipeline: + steps: Sequence[Step] + + def get_steps(self) -> list[Step]: + return list(self.steps) + + +class PipelineCallback: + """Lifecycle hooks consumed by the runner; defaults are no-ops.""" + + def before_step(self, step: Step) -> None: + del step + + def after_step(self, step: Step, *, error: Exception | None = None) -> None: + del step, error + + +@dataclass(frozen=True) +class RunnerConfig: + """Placeholder for future runner toggles.""" + + +class PipelineRunner: + + def __init__(self, config: RunnerConfig | None = None) -> None: + self._config = config or RunnerConfig() + + def run(self, + pipeline: Pipeline, + callback: PipelineCallback | None = None) -> None: + current_step: Step | None = None + steps = pipeline.get_steps() + logging.info("Starting pipeline with %d steps", len(steps)) + try: + for step in steps: + current_step = step + logging.info("Preparing step %s (v%d)", step.name, step.version) + if callback: + callback.before_step(step) + try: + step.run() + 
except PipelineAbort as exc: + if callback: + callback.after_step(step, error=exc) + raise + except Exception as exc: # pylint: disable=broad-except + if callback: + callback.after_step(step, error=exc) + logging.exception("Step %s failed", step.name) + raise + if callback: + callback.after_step(step) + logging.info("Finished step %s", step.name) + logging.info("Pipeline completed") + except PipelineAbort as exc: + name = current_step.name if current_step else "" + logging.info("Pipeline aborted at %s", name) diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py new file mode 100644 index 0000000000..1bd66aee15 --- /dev/null +++ b/tools/agentic_import/pipeline_test.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +"""Unit tests for the Phase 0 pipeline skeleton.""" + +import os +import sys +import unittest + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) + +from pipeline import ( # pylint: disable=import-error + BaseStep, Pipeline, PipelineAbort, PipelineCallback, PipelineRunner, + RunnerConfig, Step, +) + + +class _TrackingStep(BaseStep): + + def __init__(self, name: str, events: list[str]) -> None: + super().__init__(name=name, version=1) + self._events = events + self.executed = False + + def run(self) -> None: + self.executed = True + self._events.append(f"run:{self.name}") + + def dry_run(self) -> str: + return "" + + +class PipelineRunnerTest(unittest.TestCase): + + def _build_pipeline(self, events: list[str]) -> Pipeline: + step_one = _TrackingStep("one", events) + step_two = _TrackingStep("two", events) + return Pipeline(steps=[step_one, step_two]) + + def test_on_before_step_runs_before_each_step(self) -> None: + events: list[str] = [] + + class RecordingCallback(PipelineCallback): + + def before_step(self, step: Step) -> None: + events.append(f"before:{step.name}") + + pipeline = self._build_pipeline(events) + PipelineRunner(RunnerConfig()).run(pipeline, RecordingCallback()) + + self.assertEqual( + events, + [ + "before:one", + "run:one", + "before:two", + "run:two", + ], + ) + + def test_pipeline_abort_skips_downstream_steps(self) -> None: + events: list[str] = [] + pipeline = self._build_pipeline(events) + runner = PipelineRunner(RunnerConfig()) + + class AbortOnSecond(PipelineCallback): + + def before_step(self, step: Step) -> None: + if step.name == "two": + raise PipelineAbort("stop") + + runner.run(pipeline, AbortOnSecond()) + + self.assertEqual(events, ["run:one"]) + + +if __name__ == "__main__": + unittest.main() From c125d782f4020a10081a6bdc96fdc17bb8393322 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 10 Nov 2025 17:06:02 +0000 Subject: [PATCH 02/54] Add SDMX pipeline state callback and tests Capture step timing and errors in JSON state files --- tools/agentic_import/pipeline_test.py | 15 ++ tools/agentic_import/sdmx_import_pipeline.py | 112 ++++++++++++++ .../sdmx_import_pipeline_test.py | 137 ++++++++++++++++++ 3 files changed, 264 insertions(+) create mode 100644 tools/agentic_import/sdmx_import_pipeline.py create mode 100644 tools/agentic_import/sdmx_import_pipeline_test.py diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 1bd66aee15..6ba51fcbcb 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -1,4 +1,19 @@ #!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Unit tests for the Phase 0 pipeline skeleton.""" import os diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py new file mode 100644 index 0000000000..0107ce381f --- /dev/null +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -0,0 +1,112 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers for the SDMX agentic import pipeline.""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Callable + +from absl import logging + +from tools.agentic_import.pipeline import PipelineCallback, Step + + +def _format_time(value: datetime) -> str: + if value.tzinfo is None: + value = value.replace(tzinfo=timezone.utc) + return value.isoformat() + + +@dataclass +class StepState: + version: int + status: str + started_at: str + ended_at: str + duration_s: float + message: str | None = None + + +@dataclass +class PipelineState: + run_id: str + critical_input_hash: str + command: str + updated_at: str + steps: dict[str, StepState] = field(default_factory=dict) + + +class JSONStateCallback(PipelineCallback): + """Persists pipeline progress to the SDMX state file. + + The callback is intentionally unaware of planning concerns. The CLI computes + identifiers such as run_id and critical_input_hash before invoking the + runner, then instantiates this callback with the desired destination file. 
+ """ + + def __init__(self, + *, + state_path: str | Path, + run_id: str, + critical_input_hash: str, + command: str, + now_fn: Callable[[], datetime] | None = None) -> None: + self._state_path = Path(state_path) + self._now_fn = now_fn or (lambda: datetime.now(timezone.utc)) + self._state = PipelineState( + run_id=run_id, + critical_input_hash=critical_input_hash, + command=command, + updated_at=_format_time(self._now()), + ) + self._step_start_times: dict[str, datetime] = {} + self._state_path.parent.mkdir(parents=True, exist_ok=True) + logging.info("JSON state will be written to %s", self._state_path) + + def before_step(self, step: Step) -> None: + started_at = self._now() + self._step_start_times[step.name] = started_at + + def after_step(self, step: Step, *, error: Exception | None = None) -> None: + ended_at = self._now() + started_at = self._step_start_times.pop(step.name, None) + if started_at is None: + started_at = ended_at + duration = max(0.0, (ended_at - started_at).total_seconds()) + step_state = StepState( + version=step.version, + status="failed" if error else "succeeded", + started_at=_format_time(started_at), + ended_at=_format_time(ended_at), + duration_s=duration, + message=str(error) or error.__class__.__name__ if error else None, + ) + self._state.steps[step.name] = step_state + self._state.updated_at = step_state.ended_at + self._write_state() + + def _now(self) -> datetime: + return self._now_fn() + + def _write_state(self) -> None: + temp_path = self._state_path.with_suffix(self._state_path.suffix + ".tmp") + with temp_path.open("w", encoding="utf-8") as fp: + json.dump(asdict(self._state), fp, indent=2, sort_keys=True) + fp.write("\n") + temp_path.replace(self._state_path) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py new file mode 100644 index 0000000000..792aac3f01 --- /dev/null +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for the Phase 1 JSON state callback.""" + +from __future__ import annotations + +import json +import os +import sys +import tempfile +import unittest +from datetime import datetime, timedelta, timezone + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_REPO_ROOT = os.path.dirname(_SCRIPT_DIR) +_PROJECT_ROOT = os.path.dirname(_REPO_ROOT) +for path in (_PROJECT_ROOT,): + if path not in sys.path: + sys.path.append(path) + +from tools.agentic_import.pipeline import ( # pylint: disable=import-error + BaseStep, + Pipeline, + PipelineRunner, + RunnerConfig, +) +from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error + JSONStateCallback, +) + + +class _IncrementingClock: + + def __init__(self, start: datetime, step: timedelta) -> None: + self._value = start + self._step = step + self._first_call = True + + def __call__(self) -> datetime: + if self._first_call: + self._first_call = False + return self._value + self._value = self._value + self._step + return self._value + + +class _RecordingStep(BaseStep): + + def __init__(self, name: str, *, should_fail: bool = False) -> None: + super().__init__(name=name, version=1) + self._should_fail = should_fail + + def run(self) -> None: + if self._should_fail: + raise ValueError("boom") + + def dry_run(self) -> str: + return "noop" + + +class JSONStateCallbackTest(unittest.TestCase): + + def _build_callback(self, *, tmpdir: str, + clock: _IncrementingClock) -> JSONStateCallback: + state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") + return JSONStateCallback( + state_path=state_path, + run_id="demo", + critical_input_hash="abc123", + command="python run", + now_fn=clock, + ) + + def test_successful_step_persists_expected_schema(self) -> None: + clock = _IncrementingClock( + datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc), + timedelta(seconds=5)) + with tempfile.TemporaryDirectory() as tmpdir: + callback = self._build_callback(tmpdir=tmpdir, clock=clock) + pipeline = Pipeline(steps=[_RecordingStep("download.download-data")]) + runner = PipelineRunner(RunnerConfig()) + runner.run(pipeline, callback) + + state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") + with open(state_path, encoding="utf-8") as fp: + state = json.load(fp) + + step_state = state["steps"]["download.download-data"] + self.assertEqual(state["run_id"], "demo") + self.assertEqual(state["critical_input_hash"], "abc123") + self.assertEqual(step_state["status"], "succeeded") + self.assertIn("started_at", step_state) + self.assertIn("ended_at", step_state) + self.assertAlmostEqual(step_state["duration_s"], 5.0) + self.assertIn("message", step_state) + self.assertIsNone(step_state["message"]) + self.assertEqual(state["updated_at"], step_state["ended_at"]) + + def test_failed_step_records_error_and_persists_file(self) -> None: + clock = _IncrementingClock( + datetime(2025, 1, 2, 0, 0, tzinfo=timezone.utc), + timedelta(seconds=7)) + with tempfile.TemporaryDirectory() as tmpdir: + callback = self._build_callback(tmpdir=tmpdir, clock=clock) + pipeline = Pipeline( + steps=[_RecordingStep("sample.create-sample", should_fail=True)]) + runner = PipelineRunner(RunnerConfig()) + + with self.assertRaisesRegex(ValueError, "boom"): + runner.run(pipeline, callback) + + state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") + with open(state_path, encoding="utf-8") as fp: + state = json.load(fp) + + step_state = state["steps"]["sample.create-sample"] + self.assertEqual(step_state["status"], "failed") + 
self.assertIn("boom", step_state["message"]) + self.assertAlmostEqual(step_state["duration_s"], 7.0) + + +if __name__ == "__main__": + unittest.main() From 5271fd3837f4e79b3993cacbcb72b514d9d368a4 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 10 Nov 2025 17:07:59 +0000 Subject: [PATCH 03/54] Polish agentic import docstrings and format Align module documentation and line wrapping with current style --- tools/agentic_import/pipeline_test.py | 3 +-- tools/agentic_import/sdmx_import_pipeline.py | 4 ++-- .../sdmx_import_pipeline_test.py | 19 ++++++++----------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 6ba51fcbcb..b68864d1fa 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -"""Unit tests for the Phase 0 pipeline skeleton.""" +"""Unit tests for the agentic pipeline skeleton.""" import os import sys diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 0107ce381f..c96b8d6f72 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """Helpers for the SDMX agentic import pipeline.""" from __future__ import annotations @@ -105,7 +104,8 @@ def _now(self) -> datetime: return self._now_fn() def _write_state(self) -> None: - temp_path = self._state_path.with_suffix(self._state_path.suffix + ".tmp") + temp_path = self._state_path.with_suffix(self._state_path.suffix + + ".tmp") with temp_path.open("w", encoding="utf-8") as fp: json.dump(asdict(self._state), fp, indent=2, sort_keys=True) fp.write("\n") diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 792aac3f01..a9e94e5b0d 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -"""Unit tests for the Phase 1 JSON state callback.""" +"""Unit tests for SDMX pipeline helpers.""" from __future__ import annotations @@ -33,14 +32,10 @@ sys.path.append(path) from tools.agentic_import.pipeline import ( # pylint: disable=import-error - BaseStep, - Pipeline, - PipelineRunner, - RunnerConfig, + BaseStep, Pipeline, PipelineRunner, RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - JSONStateCallback, -) + JSONStateCallback,) class _IncrementingClock: @@ -91,7 +86,8 @@ def test_successful_step_persists_expected_schema(self) -> None: timedelta(seconds=5)) with tempfile.TemporaryDirectory() as tmpdir: callback = self._build_callback(tmpdir=tmpdir, clock=clock) - pipeline = Pipeline(steps=[_RecordingStep("download.download-data")]) + pipeline = Pipeline( + steps=[_RecordingStep("download.download-data")]) runner = PipelineRunner(RunnerConfig()) runner.run(pipeline, callback) @@ -116,8 +112,9 @@ def test_failed_step_records_error_and_persists_file(self) -> None: timedelta(seconds=7)) with tempfile.TemporaryDirectory() as tmpdir: callback = self._build_callback(tmpdir=tmpdir, clock=clock) - pipeline = Pipeline( - steps=[_RecordingStep("sample.create-sample", should_fail=True)]) + pipeline = Pipeline(steps=[ + _RecordingStep("sample.create-sample", should_fail=True) + ]) runner = PipelineRunner(RunnerConfig()) with self.assertRaisesRegex(ValueError, "boom"): From d6441f669e0a16305eaecc6d3259b3aa60d50fe6 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 06:02:25 +0000 Subject: [PATCH 04/54] Improve agentic import runner observability Add state handler persistence and dataclasses-json dependency --- requirements_all.txt | 1 + tools/agentic_import/pipeline.py | 27 ++-- tools/agentic_import/pipeline_test.py | 59 +++++++++ tools/agentic_import/sdmx_import_pipeline.py | 73 ++++------ .../sdmx_import_pipeline_test.py | 72 ++++++++-- tools/agentic_import/state_handler.py | 125 ++++++++++++++++++ tools/agentic_import/state_handler_test.py | 73 ++++++++++ 7 files changed, 358 insertions(+), 72 deletions(-) create mode 100644 tools/agentic_import/state_handler.py create mode 100644 tools/agentic_import/state_handler_test.py diff --git a/requirements_all.txt b/requirements_all.txt index 9ce4e5a975..6983f07837 100644 --- a/requirements_all.txt +++ b/requirements_all.txt @@ -9,6 +9,7 @@ absl-py chembl-webresource-client +dataclasses-json deepdiff earthengine-api flask_restful diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index ecd9211331..cad62d89c0 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -79,9 +79,11 @@ class PipelineCallback: """Lifecycle hooks consumed by the runner; defaults are no-ops.""" def before_step(self, step: Step) -> None: + """Called immediately before `step.run()`; raising an error skips execution.""" del step def after_step(self, step: Step, *, error: Exception | None = None) -> None: + """Runs once per step after `step.run()` succeeds or raises.""" del step, error @@ -100,28 +102,25 @@ def run(self, callback: PipelineCallback | None = None) -> None: current_step: Step | None = None steps = pipeline.get_steps() - logging.info("Starting pipeline with %d steps", len(steps)) + logging.info(f"Starting pipeline with {len(steps)} steps") try: for step in steps: current_step = step - logging.info("Preparing step %s (v%d)", step.name, step.version) + logging.info(f"Preparing step {step.name} (v{step.version})") if callback: 
callback.before_step(step) + error: Exception | None = None try: step.run() - except PipelineAbort as exc: - if callback: - callback.after_step(step, error=exc) - raise except Exception as exc: # pylint: disable=broad-except - if callback: - callback.after_step(step, error=exc) - logging.exception("Step %s failed", step.name) + error = exc + logging.exception(f"Step {step.name} failed") raise - if callback: - callback.after_step(step) - logging.info("Finished step %s", step.name) + finally: + if callback: + callback.after_step(step, error=error) + logging.info(f"Finished step {step.name}") logging.info("Pipeline completed") - except PipelineAbort as exc: + except PipelineAbort: name = current_step.name if current_step else "" - logging.info("Pipeline aborted at %s", name) + logging.info(f"Pipeline aborted at {name}") diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index b68864d1fa..5b5ccfab07 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -43,6 +43,18 @@ def dry_run(self) -> str: return "" +class _FailingStep(BaseStep): + + def __init__(self, *, name: str, version: int) -> None: + super().__init__(name=name, version=version) + + def run(self) -> None: + raise ValueError("boom") + + def dry_run(self) -> str: + return "" + + class PipelineRunnerTest(unittest.TestCase): def _build_pipeline(self, events: list[str]) -> Pipeline: @@ -85,6 +97,53 @@ def before_step(self, step: Step) -> None: runner.run(pipeline, AbortOnSecond()) self.assertEqual(events, ["run:one"]) + # PipelineAbort is swallowed by the runner, so execution simply stops. + + def test_before_step_exception_skips_after_step(self) -> None: + events: list[str] = [] + pipeline = Pipeline(steps=[_TrackingStep("one", events)]) + runner = PipelineRunner(RunnerConfig()) + + class RecordingCallback(PipelineCallback): + + def before_step(self, step: Step) -> None: + events.append(f"before:{step.name}") + raise RuntimeError("boom") + + def after_step(self, + step: Step, + *, + error: Exception | None = None) -> None: + del step, error + events.append("after-called") + + with self.assertRaises(RuntimeError): + runner.run(pipeline, RecordingCallback()) + + self.assertEqual(events, ["before:one"]) + + def test_after_step_receives_error_when_step_fails(self) -> None: + + class RecordingCallback(PipelineCallback): + + def __init__(self) -> None: + self.after_calls: list[tuple[str, str | None]] = [] + + def after_step(self, + step: Step, + *, + error: Exception | None = None) -> None: + name = step.name + error_name = type(error).__name__ if error else None + self.after_calls.append((name, error_name)) + + callback = RecordingCallback() + pipeline = Pipeline(steps=[_FailingStep(name="fail-step", version=1)]) + + with self.assertRaises(ValueError): + PipelineRunner(RunnerConfig()).run(pipeline, callback) + + self.assertEqual(callback.after_calls, [("fail-step", "ValueError")]) if __name__ == "__main__": diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index c96b8d6f72..e49a8e19cb 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,15 +15,13 @@ from __future__ import annotations -import json -from dataclasses import asdict, dataclass, field from datetime import datetime, timezone -from pathlib import Path from typing import Callable from absl import logging -from tools.agentic_import.pipeline import PipelineCallback, Step +from 
tools.agentic_import.pipeline import PipelineAbort, PipelineCallback, Step +from tools.agentic_import.state_handler import StateHandler, StepState def _format_time(value: datetime) -> str: @@ -32,51 +30,29 @@ def _format_time(value: datetime) -> str: return value.isoformat() -@dataclass -class StepState: - version: int - status: str - started_at: str - ended_at: str - duration_s: float - message: str | None = None - - -@dataclass -class PipelineState: - run_id: str - critical_input_hash: str - command: str - updated_at: str - steps: dict[str, StepState] = field(default_factory=dict) - - class JSONStateCallback(PipelineCallback): - """Persists pipeline progress to the SDMX state file. + """Persists pipeline progress to the SDMX state file via StateHandler. - The callback is intentionally unaware of planning concerns. The CLI computes - identifiers such as run_id and critical_input_hash before invoking the - runner, then instantiates this callback with the desired destination file. + This callback assumes a single process owns the state file for the lifetime + of the run. The CLI or builder sets run metadata up-front; this class only + mutates state after a step executes. """ def __init__(self, *, - state_path: str | Path, + state_handler: StateHandler, run_id: str, critical_input_hash: str, command: str, now_fn: Callable[[], datetime] | None = None) -> None: - self._state_path = Path(state_path) + self._handler = state_handler self._now_fn = now_fn or (lambda: datetime.now(timezone.utc)) - self._state = PipelineState( - run_id=run_id, - critical_input_hash=critical_input_hash, - command=command, - updated_at=_format_time(self._now()), - ) + self._state = self._handler.get_state() + self._state.run_id = run_id + self._state.critical_input_hash = critical_input_hash + self._state.command = command self._step_start_times: dict[str, datetime] = {} - self._state_path.parent.mkdir(parents=True, exist_ok=True) - logging.info("JSON state will be written to %s", self._state_path) + logging.info(f"JSON state will be written to {self._handler.path}") def before_step(self, step: Step) -> None: started_at = self._now() @@ -88,25 +64,28 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: if started_at is None: started_at = ended_at duration = max(0.0, (ended_at - started_at).total_seconds()) + if isinstance(error, PipelineAbort): + logging.info( + f"Skipping state update for {step.name} due to pipeline abort") + return + if error: + message = str(error) or error.__class__.__name__ + else: + message = None + # Step stats are persisted only after the step finishes; steps can still + # be skipped after their before_step callback runs, so we leave skipped + # steps untouched to preserve prior state. 
step_state = StepState( version=step.version, status="failed" if error else "succeeded", started_at=_format_time(started_at), ended_at=_format_time(ended_at), duration_s=duration, - message=str(error) or error.__class__.__name__ if error else None, + message=message, ) self._state.steps[step.name] = step_state self._state.updated_at = step_state.ended_at - self._write_state() + self._handler.save_state() def _now(self) -> datetime: return self._now_fn() - - def _write_state(self) -> None: - temp_path = self._state_path.with_suffix(self._state_path.suffix + - ".tmp") - with temp_path.open("w", encoding="utf-8") as fp: - json.dump(asdict(self._state), fp, indent=2, sort_keys=True) - fp.write("\n") - temp_path.replace(self._state_path) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index a9e94e5b0d..259db3e979 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -32,10 +32,11 @@ sys.path.append(path) from tools.agentic_import.pipeline import ( # pylint: disable=import-error - BaseStep, Pipeline, PipelineRunner, RunnerConfig, + BaseStep, Pipeline, PipelineAbort, PipelineRunner, RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error JSONStateCallback,) +from tools.agentic_import.state_handler import StateHandler # pylint: disable=import-error class _IncrementingClock: @@ -69,30 +70,32 @@ def dry_run(self) -> str: class JSONStateCallbackTest(unittest.TestCase): - def _build_callback(self, *, tmpdir: str, - clock: _IncrementingClock) -> JSONStateCallback: + def _build_callback( + self, *, tmpdir: str, clock: _IncrementingClock + ) -> tuple[JSONStateCallback, StateHandler]: state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") - return JSONStateCallback( - state_path=state_path, + handler = StateHandler(state_path=state_path, dataset_prefix="demo") + callback = JSONStateCallback( + state_handler=handler, run_id="demo", critical_input_hash="abc123", command="python run", now_fn=clock, ) + return callback, handler def test_successful_step_persists_expected_schema(self) -> None: clock = _IncrementingClock( datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc), timedelta(seconds=5)) with tempfile.TemporaryDirectory() as tmpdir: - callback = self._build_callback(tmpdir=tmpdir, clock=clock) + callback, handler = self._build_callback(tmpdir=tmpdir, clock=clock) pipeline = Pipeline( steps=[_RecordingStep("download.download-data")]) runner = PipelineRunner(RunnerConfig()) runner.run(pipeline, callback) - state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") - with open(state_path, encoding="utf-8") as fp: + with open(handler.path, encoding="utf-8") as fp: state = json.load(fp) step_state = state["steps"]["download.download-data"] @@ -111,7 +114,7 @@ def test_failed_step_records_error_and_persists_file(self) -> None: datetime(2025, 1, 2, 0, 0, tzinfo=timezone.utc), timedelta(seconds=7)) with tempfile.TemporaryDirectory() as tmpdir: - callback = self._build_callback(tmpdir=tmpdir, clock=clock) + callback, handler = self._build_callback(tmpdir=tmpdir, clock=clock) pipeline = Pipeline(steps=[ _RecordingStep("sample.create-sample", should_fail=True) ]) @@ -120,8 +123,7 @@ def test_failed_step_records_error_and_persists_file(self) -> None: with self.assertRaisesRegex(ValueError, "boom"): runner.run(pipeline, callback) - state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") - with 
open(state_path, encoding="utf-8") as fp: + with open(handler.path, encoding="utf-8") as fp: state = json.load(fp) step_state = state["steps"]["sample.create-sample"] @@ -129,6 +131,54 @@ def test_failed_step_records_error_and_persists_file(self) -> None: self.assertIn("boom", step_state["message"]) self.assertAlmostEqual(step_state["duration_s"], 7.0) + def test_abort_skips_state_persistence(self) -> None: + clock = _IncrementingClock( + datetime(2025, 1, 3, 0, 0, tzinfo=timezone.utc), + timedelta(seconds=3)) + with tempfile.TemporaryDirectory() as tmpdir: + state_dir = os.path.join(tmpdir, ".datacommons") + os.makedirs(state_dir, exist_ok=True) + state_path = os.path.join(state_dir, "demo.state.json") + previous = { + "run_id": "previous", + "critical_input_hash": "old", + "command": "old command", + "updated_at": "2025-01-01T00:00:00Z", + "steps": { + "existing.step": { + "version": 1, + "status": "succeeded", + "started_at": "2025-01-01T00:00:00Z", + "ended_at": "2025-01-01T00:05:00Z", + "duration_s": 300.0, + "message": None, + } + }, + } + with open(state_path, "w", encoding="utf-8") as fp: + json.dump(previous, fp) + callback, handler = self._build_callback(tmpdir=tmpdir, clock=clock) + + class _AbortStep(BaseStep): + + def __init__(self) -> None: + super().__init__(name="download.download-data", version=1) + + def run(self) -> None: + raise PipelineAbort("user requested stop") + + def dry_run(self) -> str: + return "noop" + + pipeline = Pipeline(steps=[_AbortStep()]) + runner = PipelineRunner(RunnerConfig()) + runner.run(pipeline, callback) + + with open(handler.path, encoding="utf-8") as fp: + state = json.load(fp) + + self.assertEqual(state, previous) + if __name__ == "__main__": unittest.main() diff --git a/tools/agentic_import/state_handler.py b/tools/agentic_import/state_handler.py new file mode 100644 index 0000000000..cb2cb85b02 --- /dev/null +++ b/tools/agentic_import/state_handler.py @@ -0,0 +1,125 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""State file helpers shared by the SDMX agentic pipeline components. + +The handler centralizes JSON persistence so callers (builder, callbacks) can +operate on an in-memory `PipelineState`. This implementation assumes a single +process has exclusive ownership of the state file for the duration of a run. 
+""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path + +from absl import logging +from dataclasses_json import dataclass_json + + +@dataclass_json +@dataclass +class StepState: + version: int + status: str + started_at: str + ended_at: str + duration_s: float + message: str | None = None + + +@dataclass_json +@dataclass +class PipelineState: + run_id: str + critical_input_hash: str + command: str + updated_at: str + steps: dict[str, StepState] = field(default_factory=dict) + + +class StateHandler: + """Minimal state manager that owns JSON file I/O.""" + + def __init__(self, state_path: str | Path, dataset_prefix: str) -> None: + self._state_path = Path(state_path) + self._dataset_prefix = dataset_prefix + self._state: PipelineState | None = None + + @property + def path(self) -> Path: + """Returns the backing state file path.""" + return self._state_path + + def get_state(self) -> PipelineState: + if self._state is None: + self._state = self._load_or_init() + return self._state + + def save_state(self) -> None: + state = self.get_state() + self._write_state(state) + + def _load_or_init(self) -> PipelineState: + path = self._state_path + path.parent.mkdir(parents=True, exist_ok=True) + if not path.exists(): + state = self._empty_state() + logging.info(f"Creating new state file at {path}") + self._write_state(state) + return state + try: + with path.open("r", encoding="utf-8") as fp: + data = json.load(fp) + state = PipelineState.from_dict(data) + if not state.run_id: + state.run_id = self._dataset_prefix + return state + except (OSError, json.JSONDecodeError, ValueError, TypeError) as exc: + logging.warning(f"Failed to load state file {path}: {exc}") + self._backup_bad_file() + state = self._empty_state() + self._write_state(state) + return state + + def _write_state(self, state: PipelineState) -> None: + directory = self._state_path.parent + directory.mkdir(parents=True, exist_ok=True) + payload = json.dumps(asdict(state), indent=2, sort_keys=True) + "\n" + tmp_path = self._state_path.with_suffix(".tmp") + tmp_path.write_text(payload, encoding="utf-8") + tmp_path.replace(self._state_path) + + def _empty_state(self) -> PipelineState: + return PipelineState( + run_id=self._dataset_prefix, + critical_input_hash="", + command="", + updated_at="", + ) + + def _backup_bad_file(self) -> None: + path = self._state_path + if not path.exists(): + return + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + backup_name = f"{path.name}.bad.{timestamp}.bak" + backup_path = path.with_name(backup_name) + try: + path.replace(backup_path) + logging.warning(f"Backed up corrupt state to {backup_path}") + except OSError as exc: + logging.warning( + f"Failed to backup corrupt state file {path}: {exc}") diff --git a/tools/agentic_import/state_handler_test.py b/tools/agentic_import/state_handler_test.py new file mode 100644 index 0000000000..b8af010fd7 --- /dev/null +++ b/tools/agentic_import/state_handler_test.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the SDMX state handler.""" + +from __future__ import annotations + +import json +import os +import sys +import tempfile +import unittest + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +if _SCRIPT_DIR not in sys.path: + sys.path.append(_SCRIPT_DIR) + +from state_handler import StateHandler # pylint: disable=import-error + + +class StateHandlerTest(unittest.TestCase): + + def test_missing_file_creates_empty_state(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "demo.state.json") + handler = StateHandler(state_path=path, dataset_prefix="demo") + + state = handler.get_state() + + self.assertTrue(os.path.exists(path)) + self.assertEqual(state.run_id, "demo") + self.assertEqual(state.steps, {}) + + with open(path, encoding="utf-8") as fp: + data = json.load(fp) + self.assertEqual(data["run_id"], "demo") + self.assertEqual(data["steps"], {}) + + def test_corrupt_file_creates_backup_and_resets_state(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "demo.state.json") + with open(path, "w", encoding="utf-8") as fp: + fp.write("{not-json}") + + handler = StateHandler(state_path=path, dataset_prefix="demo") + state = handler.get_state() + + backups = [ + name for name in os.listdir(tmpdir) + if name.startswith("demo.state.json.bad.") + ] + self.assertEqual(state.steps, {}) + self.assertGreaterEqual(len(backups), 1) + + with open(path, encoding="utf-8") as fp: + data = json.load(fp) + self.assertEqual(data["steps"], {}) + + +if __name__ == "__main__": + unittest.main() From 593db877b32f89fdf353f243e9023dda6baa9c65 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 06:38:47 +0000 Subject: [PATCH 05/54] Add interactive confirmation to SDMX runner Add CompositeCallback support for callback stacking Cover interactive and factory behavior with tests --- tools/agentic_import/pipeline.py | 15 ++++ tools/agentic_import/pipeline_test.py | 43 +++++++++- tools/agentic_import/sdmx_import_pipeline.py | 38 +++++++- .../sdmx_import_pipeline_test.py | 86 ++++++++++++++++++- 4 files changed, 177 insertions(+), 5 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index cad62d89c0..ec3d1b2457 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -87,6 +87,21 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: del step, error +class CompositeCallback(PipelineCallback): + """Fans out events to child callbacks in order.""" + + def __init__(self, callbacks: Sequence[PipelineCallback]) -> None: + self._callbacks = list(callbacks) + + def before_step(self, step: Step) -> None: + for callback in self._callbacks: + callback.before_step(step) + + def after_step(self, step: Step, *, error: Exception | None = None) -> None: + for callback in self._callbacks: + callback.after_step(step, error=error) + + @dataclass(frozen=True) class RunnerConfig: """Placeholder for future runner toggles.""" diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 
5b5ccfab07..67c25ada12 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -23,8 +23,8 @@ sys.path.append(_SCRIPT_DIR) from pipeline import ( # pylint: disable=import-error - BaseStep, Pipeline, PipelineAbort, PipelineCallback, PipelineRunner, - RunnerConfig, Step, + BaseStep, CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, + PipelineRunner, RunnerConfig, Step, ) @@ -146,5 +146,44 @@ def after_step(self, self.assertEqual(callback.after_calls, [("fail-step", "ValueError")]) +class CompositeCallbackTest(unittest.TestCase): + + def test_callbacks_run_in_order_for_each_hook(self) -> None: + events: list[str] = [] + + class RecordingCallback(PipelineCallback): + + def __init__(self, label: str) -> None: + self._label = label + + def before_step(self, step: Step) -> None: + events.append(f"{self._label}:before:{step.name}") + + def after_step(self, + step: Step, + *, + error: Exception | None = None) -> None: + del error + events.append(f"{self._label}:after:{step.name}") + + composite = CompositeCallback( + [RecordingCallback("first"), + RecordingCallback("second")]) + step = _TrackingStep("composite", events) + + composite.before_step(step) + composite.after_step(step) + + self.assertEqual( + events, + [ + "first:before:composite", + "second:before:composite", + "first:after:composite", + "second:after:composite", + ], + ) + + if __name__ == "__main__": unittest.main() diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index e49a8e19cb..43abda0612 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -20,7 +20,8 @@ from absl import logging -from tools.agentic_import.pipeline import PipelineAbort, PipelineCallback, Step +from tools.agentic_import.pipeline import (CompositeCallback, PipelineAbort, + PipelineCallback, Step) from tools.agentic_import.state_handler import StateHandler, StepState @@ -30,6 +31,20 @@ def _format_time(value: datetime) -> str: return value.isoformat() +class InteractiveCallback(PipelineCallback): + """Prompts the user before each step runs.""" + + def before_step(self, step: Step) -> None: + preview = step.dry_run() + logging.info(f"Dry run for {step.name} (v{step.version}):") + if preview: + logging.info(preview) + prompt = f"Run step {step.name} (v{step.version})? [Y/n] " + response = input(prompt).strip().lower() + if response in ("n", "no"): + raise PipelineAbort("user declined interactive prompt") + + class JSONStateCallback(PipelineCallback): """Persists pipeline progress to the SDMX state file via StateHandler. 
@@ -89,3 +104,24 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: def _now(self) -> datetime: return self._now_fn() + + +def build_pipeline_callback( + *, + state_handler: StateHandler, + run_id: str, + critical_input_hash: str, + command: str, + skip_confirmation: bool, + now_fn: Callable[[], datetime] | None = None, +) -> PipelineCallback: + """Constructs the pipeline callback stack for the SDMX runner.""" + json_callback = JSONStateCallback(state_handler=state_handler, + run_id=run_id, + critical_input_hash=critical_input_hash, + command=command, + now_fn=now_fn) + if skip_confirmation: + return json_callback + interactive = InteractiveCallback() + return CompositeCallback([interactive, json_callback]) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 259db3e979..7133d1b2b6 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -23,6 +23,9 @@ import tempfile import unittest from datetime import datetime, timedelta, timezone +from unittest import mock + +from absl import logging _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) _REPO_ROOT = os.path.dirname(_SCRIPT_DIR) @@ -32,10 +35,11 @@ sys.path.append(path) from tools.agentic_import.pipeline import ( # pylint: disable=import-error - BaseStep, Pipeline, PipelineAbort, PipelineRunner, RunnerConfig, + BaseStep, CompositeCallback, Pipeline, PipelineAbort, PipelineRunner, + RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - JSONStateCallback,) + InteractiveCallback, JSONStateCallback, build_pipeline_callback) from tools.agentic_import.state_handler import StateHandler # pylint: disable=import-error @@ -180,5 +184,83 @@ def dry_run(self) -> str: self.assertEqual(state, previous) +class InteractiveCallbackTest(unittest.TestCase): + + def test_prompt_accepts_and_runs_step(self) -> None: + callback = InteractiveCallback() + pipeline = Pipeline(steps=[_RecordingStep("download.preview")]) + runner = PipelineRunner(RunnerConfig()) + + with mock.patch("builtins.input", return_value="y"): + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + runner.run(pipeline, callback) + + self.assertTrue( + any("Dry run for download.preview" in entry + for entry in logs.output)) + + def test_prompt_decline_aborts_pipeline(self) -> None: + events: list[str] = [] + + class _TrackingStep(_RecordingStep): + + def __init__(self) -> None: + super().__init__("sample.interactive") + self.executed = False + + def run(self) -> None: + self.executed = True + super().run() + + def dry_run(self) -> str: + events.append("dry_run") + return "noop" + + callback = InteractiveCallback() + step = _TrackingStep() + pipeline = Pipeline(steps=[step]) + runner = PipelineRunner(RunnerConfig()) + + with mock.patch("builtins.input", return_value="n"): + with self.assertLogs(logging.get_absl_logger(), level="INFO"): + runner.run(pipeline, callback) + + self.assertFalse(step.executed) + self.assertTrue(events) + + +class CallbackFactoryTest(unittest.TestCase): + + def _state_handler_for_tmpdir(self, tmpdir: str) -> StateHandler: + path = os.path.join(tmpdir, ".datacommons", "demo.state.json") + return StateHandler(state_path=path, dataset_prefix="demo") + + def test_skip_confirmation_returns_json_callback(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + handler = self._state_handler_for_tmpdir(tmpdir) + callback = 
build_pipeline_callback( + state_handler=handler, + run_id="demo", + critical_input_hash="abc", + command="python run", + skip_confirmation=True, + ) + self.assertIsInstance(callback, JSONStateCallback) + + def test_interactive_mode_returns_composite(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + handler = self._state_handler_for_tmpdir(tmpdir) + with mock.patch("builtins.input", return_value="y"): + callback = build_pipeline_callback( + state_handler=handler, + run_id="demo", + critical_input_hash="abc", + command="python run", + skip_confirmation=False, + ) + self.assertIsInstance(callback, CompositeCallback) + + if __name__ == "__main__": unittest.main() From cb78e3813214145a5653c9875b7cb4fb77d91671 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 07:18:09 +0000 Subject: [PATCH 06/54] Log dry-run previews instead of returning text Adjust steps, tests, and the interactive callback for None results --- tools/agentic_import/pipeline.py | 4 ++-- tools/agentic_import/pipeline_test.py | 8 ++++---- tools/agentic_import/sdmx_import_pipeline.py | 4 +--- tools/agentic_import/sdmx_import_pipeline_test.py | 12 ++++++------ 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index ec3d1b2457..6c489e89a4 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -45,8 +45,8 @@ def run(self) -> None: """Execute the step.""" @abc.abstractmethod - def dry_run(self) -> str: - """Return a read-only preview of the work to be done.""" + def dry_run(self) -> None: + """Log a read-only preview of the work to be done.""" class BaseStep(Step, abc.ABC): diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 67c25ada12..ee19777ef3 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -39,8 +39,8 @@ def run(self) -> None: self.executed = True self._events.append(f"run:{self.name}") - def dry_run(self) -> str: - return "" + def dry_run(self) -> None: + return None class _FailingStep(BaseStep): @@ -51,8 +51,8 @@ def __init__(self, *, name: str, version: int) -> None: def run(self) -> None: raise ValueError("boom") - def dry_run(self) -> str: - return "" + def dry_run(self) -> None: + return None class PipelineRunnerTest(unittest.TestCase): diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 43abda0612..386e56bb6a 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -35,10 +35,8 @@ class InteractiveCallback(PipelineCallback): """Prompts the user before each step runs.""" def before_step(self, step: Step) -> None: - preview = step.dry_run() logging.info(f"Dry run for {step.name} (v{step.version}):") - if preview: - logging.info(preview) + step.dry_run() prompt = f"Run step {step.name} (v{step.version})? 
[Y/n] " response = input(prompt).strip().lower() if response in ("n", "no"): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 7133d1b2b6..a48e1b153b 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -68,8 +68,8 @@ def run(self) -> None: if self._should_fail: raise ValueError("boom") - def dry_run(self) -> str: - return "noop" + def dry_run(self) -> None: + logging.info("noop") class JSONStateCallbackTest(unittest.TestCase): @@ -171,8 +171,8 @@ def __init__(self) -> None: def run(self) -> None: raise PipelineAbort("user requested stop") - def dry_run(self) -> str: - return "noop" + def dry_run(self) -> None: + logging.info("noop") pipeline = Pipeline(steps=[_AbortStep()]) runner = PipelineRunner(RunnerConfig()) @@ -213,9 +213,9 @@ def run(self) -> None: self.executed = True super().run() - def dry_run(self) -> str: + def dry_run(self) -> None: events.append("dry_run") - return "noop" + logging.info("noop") callback = InteractiveCallback() step = _TrackingStep() From 746e1305a5b4f83508380e0446eae8f0842defc3 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 11:59:43 +0000 Subject: [PATCH 07/54] Add SDMX pipeline planning --- tools/agentic_import/sdmx_import_pipeline.py | 147 +++++++++++++++++- .../sdmx_import_pipeline_test.py | 140 ++++++++++++++++- 2 files changed, 281 insertions(+), 6 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 386e56bb6a..a0d74f797d 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,14 +15,17 @@ from __future__ import annotations +from dataclasses import dataclass from datetime import datetime, timezone -from typing import Callable +from typing import Callable, Sequence from absl import logging -from tools.agentic_import.pipeline import (CompositeCallback, PipelineAbort, - PipelineCallback, Step) -from tools.agentic_import.state_handler import StateHandler, StepState +from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, + PipelineAbort, PipelineCallback, + Step) +from tools.agentic_import.state_handler import (PipelineState, StateHandler, + StepState) def _format_time(value: datetime) -> str: @@ -123,3 +126,139 @@ def build_pipeline_callback( return json_callback interactive = InteractiveCallback() return CompositeCallback([interactive, json_callback]) + + +@dataclass(frozen=True) +class SdmxPipelineConfig: + """User-configurable inputs that mimic planned CLI flags. + + This is a lightweight container; CLI parsing will be added in a later + phase. Defaults are intentionally minimal. 
+ """ + + endpoint: str | None = None + agency: str | None = None + dataflow: str | None = None + key: str | None = None + dataset_prefix: str | None = None + working_dir: str | None = None + run_only: str | None = None + force: bool = False + verbose: bool = False + skip_confirmation: bool = False + + +class SdmxStep(Step): + """Base class for SDMX steps that carries immutable config and version.""" + + def __init__(self, *, name: str, version: int, + config: SdmxPipelineConfig) -> None: + if not name: + raise ValueError("step requires a name") + self._name = name + self._version = version + self._config = config + + @property + def name(self) -> str: + return self._name + + @property + def version(self) -> int: + return self._version + + # Subclasses must implement run() and dry_run(). + + +@dataclass(frozen=True) +class StepSpec: + phase: str + name: str + version: int + factory: Callable[[SdmxPipelineConfig], Step] + + @property + def full_name(self) -> str: + return f"{self.phase}.{self.name}" + + +@dataclass(frozen=True) +class PhaseSpec: + name: str + steps: Sequence[StepSpec] + + +@dataclass(frozen=True) +class SdmxPhaseRegistry: + phases: Sequence[PhaseSpec] + + def flatten(self) -> list[StepSpec]: + flattened: list[StepSpec] = [] + for phase in self.phases: + flattened.extend(phase.steps) + return flattened + + +class SdmxPipelineBuilder: + + def __init__(self, *, config: SdmxPipelineConfig, state: PipelineState, + registry: SdmxPhaseRegistry) -> None: + self._config = config + self._state = state + self._registry = registry + self._specs = registry.flatten() + + def build(self) -> Pipeline: + planned = self._plan_steps() + steps = [spec.factory(self._config) for spec in planned] + logging.info("Built SDMX pipeline with %d steps", len(steps)) + return Pipeline(steps=steps) + + def _plan_steps(self) -> list[StepSpec]: + specs = self._select_specs(self._specs, self._config.run_only) + if not specs: + return [] + force_all = bool(self._config.force and not self._config.run_only) + if force_all: + return list(specs) + scheduled: list[StepSpec] = [] + downstream = False + for spec in specs: + needs_run = self._should_run(spec) + if needs_run and not downstream: + downstream = True + if downstream: + scheduled.append(spec) + if not scheduled: + logging.info("No steps scheduled; all steps current") + return scheduled + + def _select_specs(self, specs: Sequence[StepSpec], + run_only: str | None) -> list[StepSpec]: + if not run_only: + return list(specs) + if "." 
in run_only: + scoped = [s for s in specs if s.full_name == run_only] + if not scoped: + raise ValueError(f"run_only target not found: {run_only}") + return scoped + scoped = [s for s in specs if s.phase == run_only] + if not scoped: + raise ValueError(f"run_only phase not found: {run_only}") + return scoped + + def _should_run(self, spec: StepSpec) -> bool: + prev = self._state.steps.get(spec.full_name) + if prev is None: + return True + if prev.status != "succeeded": + return True + if prev.version < spec.version: + return True + return False + + +def build_sdmx_pipeline(*, config: SdmxPipelineConfig, state: PipelineState, + registry: SdmxPhaseRegistry) -> Pipeline: + builder = SdmxPipelineBuilder(config=config, state=state, registry=registry) + return builder.build() diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index a48e1b153b..3c8f4690f6 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -39,8 +39,11 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - InteractiveCallback, JSONStateCallback, build_pipeline_callback) -from tools.agentic_import.state_handler import StateHandler # pylint: disable=import-error + InteractiveCallback, JSONStateCallback, SdmxPipelineBuilder, + SdmxPipelineConfig, SdmxPhaseRegistry, PhaseSpec, StepSpec, SdmxStep, + build_pipeline_callback, build_sdmx_pipeline) +from tools.agentic_import.state_handler import ( # pylint: disable=import-error + PipelineState, StateHandler, StepState) class _IncrementingClock: @@ -262,5 +265,138 @@ def test_interactive_mode_returns_composite(self) -> None: self.assertIsInstance(callback, CompositeCallback) +class _TestStep(SdmxStep): + + def run(self) -> None: + pass + + def dry_run(self) -> None: + logging.info("noop") + + +class PlanningTest(unittest.TestCase): + + def _mk_spec(self, phase: str, name: str, version: int) -> StepSpec: + full = f"{phase}.{name}" + + def _factory(cfg: SdmxPipelineConfig) -> _TestStep: + return _TestStep(name=full, version=version, config=cfg) + + return StepSpec(phase=phase, + name=name, + version=version, + factory=_factory) + + def _mk_registry(self) -> SdmxPhaseRegistry: + download = PhaseSpec( + name="download", + steps=[ + self._mk_spec("download", "fetch", 1), + self._mk_spec("download", "preview", 1) + ], + ) + process = PhaseSpec( + name="process", + steps=[self._mk_spec("process", "clean", 1)], + ) + export = PhaseSpec( + name="export", + steps=[self._mk_spec("export", "write", 1)], + ) + return SdmxPhaseRegistry(phases=[download, process, export]) + + def _empty_state(self) -> PipelineState: + return PipelineState(run_id="demo", + critical_input_hash="", + command="", + updated_at="", + steps={}) + + def _state_with(self, versions: dict[str, tuple[int, + str]]) -> PipelineState: + steps = { + name: + StepState(version=v, + status=st, + started_at="t", + ended_at="t", + duration_s=0.0) for name, (v, st) in versions.items() + } + return PipelineState(run_id="demo", + critical_input_hash="", + command="", + updated_at="", + steps=steps) + + def _names_from_builder(self, + cfg: SdmxPipelineConfig, + reg: SdmxPhaseRegistry, + state: PipelineState | None = None) -> list[str]: + builder = SdmxPipelineBuilder(config=cfg, + state=state or self._empty_state(), + registry=reg) + pipeline = builder.build() + return [step.name for step in pipeline.get_steps()] + + def test_run_only_phase_and_step(self) -> 
None: + reg = self._mk_registry() + cfg_phase = SdmxPipelineConfig(run_only="download") + names_phase = self._names_from_builder(cfg_phase, reg) + self.assertEqual(names_phase, ["download.fetch", "download.preview"]) + + cfg_step = SdmxPipelineConfig(run_only="download.fetch") + names_step = self._names_from_builder(cfg_step, reg) + self.assertEqual(names_step, ["download.fetch"]) + + with self.assertRaisesRegex(ValueError, "run_only phase not found"): + self._names_from_builder(SdmxPipelineConfig(run_only="nope"), reg) + with self.assertRaisesRegex(ValueError, "run_only target not found"): + self._names_from_builder( + SdmxPipelineConfig(run_only="download.nope"), reg) + + def test_force_semantics(self) -> None: + reg = self._mk_registry() + cfg_all = SdmxPipelineConfig(force=True) + names_all = self._names_from_builder(cfg_all, reg) + self.assertEqual(names_all, [ + "download.fetch", + "download.preview", + "process.clean", + "export.write", + ]) + + cfg_phase = SdmxPipelineConfig(run_only="download", force=True) + names_phase = self._names_from_builder(cfg_phase, reg) + self.assertEqual(names_phase, ["download.fetch", "download.preview"]) + + def test_version_bump_schedules_downstream(self) -> None: + # Make process.clean a new version while others remain the same. + download = PhaseSpec( + name="download", + steps=[self._mk_spec("download", "fetch", 1)], + ) + process = PhaseSpec( + name="process", + steps=[self._mk_spec("process", "clean", 2)], + ) + export = PhaseSpec( + name="export", + steps=[self._mk_spec("export", "write", 1)], + ) + reg = SdmxPhaseRegistry(phases=[download, process, export]) + state = self._state_with({ + "download.fetch": (1, "succeeded"), + "process.clean": (1, "succeeded"), + "export.write": (1, "succeeded"), + }) + cfg = SdmxPipelineConfig() + names = self._names_from_builder(cfg, reg, state) + self.assertEqual(names, ["process.clean", "export.write"]) + + pipeline = build_sdmx_pipeline(config=cfg, state=state, registry=reg) + self.assertEqual([s.name for s in pipeline.get_steps()], + ["process.clean", "export.write"]) + + if __name__ == "__main__": unittest.main() From 561d61cc4926ec43df7ab70c57d3be33c6a91024 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 14:41:41 +0000 Subject: [PATCH 08/54] Generalize SDMX pipeline naming Add placeholder steps for download/process/config phases Refresh tests for the renamed builder API --- tools/agentic_import/sdmx_import_pipeline.py | 118 ++++++++++++++++-- .../sdmx_import_pipeline_test.py | 40 +++--- 2 files changed, 128 insertions(+), 30 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index a0d74f797d..83b71d3ec1 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -129,7 +129,7 @@ def build_pipeline_callback( @dataclass(frozen=True) -class SdmxPipelineConfig: +class PipelineConfig: """User-configurable inputs that mimic planned CLI flags. This is a lightweight container; CLI parsing will be added in a later @@ -152,7 +152,7 @@ class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" def __init__(self, *, name: str, version: int, - config: SdmxPipelineConfig) -> None: + config: PipelineConfig) -> None: if not name: raise ValueError("step requires a name") self._name = name @@ -170,12 +170,110 @@ def version(self) -> int: # Subclasses must implement run() and dry_run(). 
+class DownloadDataStep(SdmxStep): + """Downloads SDMX data payloads.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info(f"{self.name} (dry run): previewing data download inputs") + + +class DownloadMetadataStep(SdmxStep): + """Downloads SDMX metadata payloads.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info( + f"{self.name} (dry run): previewing metadata download inputs") + + +class CreateSampleStep(SdmxStep): + """Creates a sample dataset from downloaded data.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info(f"{self.name} (dry run): previewing sample generation") + + +class CreateSchemaMapStep(SdmxStep): + """Builds schema mappings for transformed data.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info( + f"{self.name} (dry run): previewing schema mapping outputs") + + +class ProcessFullDataStep(SdmxStep): + """Processes full SDMX data into DC artifacts.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info(f"{self.name} (dry run): previewing full-data processing") + + +class CreateDcConfigStep(SdmxStep): + """Generates Datacommons configuration artifacts.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info(f"{self.name} (dry run): previewing DC config creation") + + @dataclass(frozen=True) class StepSpec: phase: str name: str version: int - factory: Callable[[SdmxPipelineConfig], Step] + factory: Callable[[PipelineConfig], Step] @property def full_name(self) -> str: @@ -189,7 +287,7 @@ class PhaseSpec: @dataclass(frozen=True) -class SdmxPhaseRegistry: +class PhaseRegistry: phases: Sequence[PhaseSpec] def flatten(self) -> list[StepSpec]: @@ -199,10 +297,10 @@ def flatten(self) -> list[StepSpec]: return flattened -class SdmxPipelineBuilder: +class PipelineBuilder: - def __init__(self, *, config: SdmxPipelineConfig, state: PipelineState, - registry: SdmxPhaseRegistry) -> None: + def __init__(self, *, config: PipelineConfig, state: PipelineState, + registry: PhaseRegistry) -> None: self._config = config self._state = state self._registry = registry @@ 
-258,7 +356,7 @@ def _should_run(self, spec: StepSpec) -> bool: return False -def build_sdmx_pipeline(*, config: SdmxPipelineConfig, state: PipelineState, - registry: SdmxPhaseRegistry) -> Pipeline: - builder = SdmxPipelineBuilder(config=config, state=state, registry=registry) +def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, + registry: PhaseRegistry) -> Pipeline: + builder = PipelineBuilder(config=config, state=state, registry=registry) return builder.build() diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 3c8f4690f6..8e023e6f0d 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -39,9 +39,9 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - InteractiveCallback, JSONStateCallback, SdmxPipelineBuilder, - SdmxPipelineConfig, SdmxPhaseRegistry, PhaseSpec, StepSpec, SdmxStep, - build_pipeline_callback, build_sdmx_pipeline) + InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, + PhaseRegistry, PhaseSpec, StepSpec, SdmxStep, build_pipeline_callback, + build_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -279,7 +279,7 @@ class PlanningTest(unittest.TestCase): def _mk_spec(self, phase: str, name: str, version: int) -> StepSpec: full = f"{phase}.{name}" - def _factory(cfg: SdmxPipelineConfig) -> _TestStep: + def _factory(cfg: PipelineConfig) -> _TestStep: return _TestStep(name=full, version=version, config=cfg) return StepSpec(phase=phase, @@ -287,7 +287,7 @@ def _factory(cfg: SdmxPipelineConfig) -> _TestStep: version=version, factory=_factory) - def _mk_registry(self) -> SdmxPhaseRegistry: + def _mk_registry(self) -> PhaseRegistry: download = PhaseSpec( name="download", steps=[ @@ -303,7 +303,7 @@ def _mk_registry(self) -> SdmxPhaseRegistry: name="export", steps=[self._mk_spec("export", "write", 1)], ) - return SdmxPhaseRegistry(phases=[download, process, export]) + return PhaseRegistry(phases=[download, process, export]) def _empty_state(self) -> PipelineState: return PipelineState(run_id="demo", @@ -329,34 +329,34 @@ def _state_with(self, versions: dict[str, tuple[int, steps=steps) def _names_from_builder(self, - cfg: SdmxPipelineConfig, - reg: SdmxPhaseRegistry, + cfg: PipelineConfig, + reg: PhaseRegistry, state: PipelineState | None = None) -> list[str]: - builder = SdmxPipelineBuilder(config=cfg, - state=state or self._empty_state(), - registry=reg) + builder = PipelineBuilder(config=cfg, + state=state or self._empty_state(), + registry=reg) pipeline = builder.build() return [step.name for step in pipeline.get_steps()] def test_run_only_phase_and_step(self) -> None: reg = self._mk_registry() - cfg_phase = SdmxPipelineConfig(run_only="download") + cfg_phase = PipelineConfig(run_only="download") names_phase = self._names_from_builder(cfg_phase, reg) self.assertEqual(names_phase, ["download.fetch", "download.preview"]) - cfg_step = SdmxPipelineConfig(run_only="download.fetch") + cfg_step = PipelineConfig(run_only="download.fetch") names_step = self._names_from_builder(cfg_step, reg) self.assertEqual(names_step, ["download.fetch"]) with self.assertRaisesRegex(ValueError, "run_only phase not found"): - self._names_from_builder(SdmxPipelineConfig(run_only="nope"), reg) + self._names_from_builder(PipelineConfig(run_only="nope"), reg) with 
self.assertRaisesRegex(ValueError, "run_only target not found"): - self._names_from_builder( - SdmxPipelineConfig(run_only="download.nope"), reg) + self._names_from_builder(PipelineConfig(run_only="download.nope"), + reg) def test_force_semantics(self) -> None: reg = self._mk_registry() - cfg_all = SdmxPipelineConfig(force=True) + cfg_all = PipelineConfig(force=True) names_all = self._names_from_builder(cfg_all, reg) self.assertEqual(names_all, [ "download.fetch", @@ -365,7 +365,7 @@ def test_force_semantics(self) -> None: "export.write", ]) - cfg_phase = SdmxPipelineConfig(run_only="download", force=True) + cfg_phase = PipelineConfig(run_only="download", force=True) names_phase = self._names_from_builder(cfg_phase, reg) self.assertEqual(names_phase, ["download.fetch", "download.preview"]) @@ -383,13 +383,13 @@ def test_version_bump_schedules_downstream(self) -> None: name="export", steps=[self._mk_spec("export", "write", 1)], ) - reg = SdmxPhaseRegistry(phases=[download, process, export]) + reg = PhaseRegistry(phases=[download, process, export]) state = self._state_with({ "download.fetch": (1, "succeeded"), "process.clean": (1, "succeeded"), "export.write": (1, "succeeded"), }) - cfg = SdmxPipelineConfig() + cfg = PipelineConfig() names = self._names_from_builder(cfg, reg, state) self.assertEqual(names, ["process.clean", "export.write"]) From 0cfcc217078e45afd78bf0294019689185e7a852 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 16:35:31 +0000 Subject: [PATCH 09/54] Use explicit SDMX registry builder Update pipeline builder and tests to rely on the canonical registry and step names --- tools/agentic_import/sdmx_import_pipeline.py | 48 +++++++- .../sdmx_import_pipeline_test.py | 115 ++++++++---------- 2 files changed, 96 insertions(+), 67 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 83b71d3ec1..7264395630 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -304,7 +304,7 @@ def __init__(self, *, config: PipelineConfig, state: PipelineState, self._config = config self._state = state self._registry = registry - self._specs = registry.flatten() + self._specs = self._registry.flatten() def build(self) -> Pipeline: planned = self._plan_steps() @@ -356,7 +356,49 @@ def _should_run(self, spec: StepSpec) -> bool: return False -def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, - registry: PhaseRegistry) -> Pipeline: + + + +def build_registry() -> PhaseRegistry: + """Constructs the hard-coded Phase 2 registry with canonical steps.""" + def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec: + full = f"{phase}.{step}" + return StepSpec( + phase=phase, + name=step, + version=cls.VERSION, + factory=lambda cfg, full_name=full, ctor=cls: ctor(name=full_name, config=cfg), + ) + + download = PhaseSpec( + name="download", + steps=[ + _spec("download", "download-data", DownloadDataStep), + _spec("download", "download-metadata", DownloadMetadataStep), + ], + ) + sample = PhaseSpec( + name="sample", + steps=[ + _spec("sample", "create-sample", CreateSampleStep), + ], + ) + schema_map = PhaseSpec( + name="schema_map", + steps=[ + _spec("schema_map", "create-schema-mapping", CreateSchemaMapStep), + ], + ) + transform = PhaseSpec( + name="transform", + steps=[ + _spec("transform", "process-full-data", ProcessFullDataStep), + _spec("transform", "create-dc-config", CreateDcConfigStep), + ], + ) + return 
PhaseRegistry(phases=[download, sample, schema_map, transform]) + +def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, registry: PhaseRegistry) -> Pipeline: builder = PipelineBuilder(config=config, state=state, registry=registry) return builder.build() + diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 8e023e6f0d..3565c4ed61 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -39,9 +39,10 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error + CreateDcConfigStep, DownloadDataStep, ProcessFullDataStep, InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, PhaseRegistry, PhaseSpec, StepSpec, SdmxStep, build_pipeline_callback, - build_sdmx_pipeline) + build_registry, build_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -265,45 +266,10 @@ def test_interactive_mode_returns_composite(self) -> None: self.assertIsInstance(callback, CompositeCallback) -class _TestStep(SdmxStep): - - def run(self) -> None: - pass - - def dry_run(self) -> None: - logging.info("noop") - - class PlanningTest(unittest.TestCase): - def _mk_spec(self, phase: str, name: str, version: int) -> StepSpec: - full = f"{phase}.{name}" - - def _factory(cfg: PipelineConfig) -> _TestStep: - return _TestStep(name=full, version=version, config=cfg) - - return StepSpec(phase=phase, - name=name, - version=version, - factory=_factory) - def _mk_registry(self) -> PhaseRegistry: - download = PhaseSpec( - name="download", - steps=[ - self._mk_spec("download", "fetch", 1), - self._mk_spec("download", "preview", 1) - ], - ) - process = PhaseSpec( - name="process", - steps=[self._mk_spec("process", "clean", 1)], - ) - export = PhaseSpec( - name="export", - steps=[self._mk_spec("export", "write", 1)], - ) - return PhaseRegistry(phases=[download, process, export]) + return build_registry() def _empty_state(self) -> PipelineState: return PipelineState(run_id="demo", @@ -342,11 +308,11 @@ def test_run_only_phase_and_step(self) -> None: reg = self._mk_registry() cfg_phase = PipelineConfig(run_only="download") names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual(names_phase, ["download.fetch", "download.preview"]) + self.assertEqual(names_phase, ["download.download-data", "download.download-metadata"]) - cfg_step = PipelineConfig(run_only="download.fetch") + cfg_step = PipelineConfig(run_only="download.download-data") names_step = self._names_from_builder(cfg_step, reg) - self.assertEqual(names_step, ["download.fetch"]) + self.assertEqual(names_step, ["download.download-data"]) with self.assertRaisesRegex(ValueError, "run_only phase not found"): self._names_from_builder(PipelineConfig(run_only="nope"), reg) @@ -359,43 +325,64 @@ def test_force_semantics(self) -> None: cfg_all = PipelineConfig(force=True) names_all = self._names_from_builder(cfg_all, reg) self.assertEqual(names_all, [ - "download.fetch", - "download.preview", - "process.clean", - "export.write", + "download.download-data", + "download.download-metadata", + "sample.create-sample", + "schema_map.create-schema-mapping", + "transform.process-full-data", + "transform.create-dc-config", ]) cfg_phase = PipelineConfig(run_only="download", force=True) names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual(names_phase, 
["download.fetch", "download.preview"]) + self.assertEqual(names_phase, + ["download.download-data", "download.download-metadata"]) def test_version_bump_schedules_downstream(self) -> None: - # Make process.clean a new version while others remain the same. - download = PhaseSpec( - name="download", - steps=[self._mk_spec("download", "fetch", 1)], - ) - process = PhaseSpec( - name="process", - steps=[self._mk_spec("process", "clean", 2)], - ) - export = PhaseSpec( - name="export", - steps=[self._mk_spec("export", "write", 1)], - ) - reg = PhaseRegistry(phases=[download, process, export]) + reg = PhaseRegistry(phases=[ + PhaseSpec( + name="download", + steps=[ + StepSpec( + phase="download", + name="download-data", + version=1, + factory=lambda cfg: DownloadDataStep( + name="download.download-data", config=cfg)), + ] + ), + PhaseSpec( + name="transform", + steps=[ + StepSpec( + phase="transform", + name="process-full-data", + version=2, + factory=lambda cfg: ProcessFullDataStep( + name="transform.process-full-data", config=cfg)), + StepSpec( + phase="transform", + name="create-dc-config", + version=1, + factory=lambda cfg: CreateDcConfigStep( + name="transform.create-dc-config", config=cfg)), + ] + ) + ]) state = self._state_with({ - "download.fetch": (1, "succeeded"), - "process.clean": (1, "succeeded"), - "export.write": (1, "succeeded"), + "download.download-data": (1, "succeeded"), + "transform.process-full-data": (1, "succeeded"), + "transform.create-dc-config": (1, "succeeded"), }) cfg = PipelineConfig() names = self._names_from_builder(cfg, reg, state) - self.assertEqual(names, ["process.clean", "export.write"]) + self.assertEqual(names, [ + "transform.process-full-data", "transform.create-dc-config" + ]) pipeline = build_sdmx_pipeline(config=cfg, state=state, registry=reg) self.assertEqual([s.name for s in pipeline.get_steps()], - ["process.clean", "export.write"]) + ["transform.process-full-data", "transform.create-dc-config"]) if __name__ == "__main__": From 8d3525ad11b9039434fd315c4ac9e50d1c44d745 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 12 Nov 2025 05:04:28 +0000 Subject: [PATCH 10/54] feat(sdmx-import): Add timestamps and improve pipeline planning This commit introduces two main improvements to the SDMX import pipeline: 1. **Timestamps in State:** Adds and (millisecond-precision Unix timestamps) to both the individual step state and the overall pipeline state. This allows for more precise and reliable comparisons of step execution times. 2. **Smarter Pipeline Planning:** The pipeline planning logic is now more robust. It will now re-run a step if any of its preceding steps have been executed more recently, even if the step itself is marked as succeeded. This ensures that downstream steps are always up-to-date with their dependencies. This commit also includes: - Refactoring of the step selection logic for clarity. - Addition of a helper function. - Updated tests to reflect these changes. 
--- tools/agentic_import/sdmx_import_pipeline.py | 83 +++++++---- .../sdmx_import_pipeline_test.py | 139 ++++++++++++------ tools/agentic_import/state_handler.py | 4 + tools/agentic_import/state_handler_test.py | 2 + 4 files changed, 156 insertions(+), 72 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 7264395630..3e757e0fc3 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -80,6 +80,8 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: if started_at is None: started_at = ended_at duration = max(0.0, (ended_at - started_at).total_seconds()) + started_at_ts = int(started_at.timestamp() * 1000) + ended_at_ts = int(ended_at.timestamp() * 1000) if isinstance(error, PipelineAbort): logging.info( f"Skipping state update for {step.name} due to pipeline abort") @@ -97,10 +99,13 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: started_at=_format_time(started_at), ended_at=_format_time(ended_at), duration_s=duration, + started_at_ts=started_at_ts, + ended_at_ts=ended_at_ts, message=message, ) self._state.steps[step.name] = step_state self._state.updated_at = step_state.ended_at + self._state.updated_at_ts = ended_at_ts self._handler.save_state() def _now(self) -> datetime: @@ -277,7 +282,7 @@ class StepSpec: @property def full_name(self) -> str: - return f"{self.phase}.{self.name}" + return _format_full_name(self.phase, self.name) @dataclass(frozen=True) @@ -313,36 +318,39 @@ def build(self) -> Pipeline: return Pipeline(steps=steps) def _plan_steps(self) -> list[StepSpec]: - specs = self._select_specs(self._specs, self._config.run_only) - if not specs: - return [] - force_all = bool(self._config.force and not self._config.run_only) - if force_all: - return list(specs) + if self._config.run_only: + return self._filter_run_only(self._specs, self._config.run_only) + if self._config.force: + return list(self._specs) scheduled: list[StepSpec] = [] - downstream = False - for spec in specs: - needs_run = self._should_run(spec) - if needs_run and not downstream: - downstream = True - if downstream: + schedule_all_remaining = False + previous: StepSpec | None = None + for spec in self._specs: + if schedule_all_remaining: scheduled.append(spec) + else: + needs_run = self._should_run(spec) + if not needs_run and previous is not None: + needs_run = self._predecessor_newer(previous, spec) + if needs_run: + scheduled.append(spec) + schedule_all_remaining = True + previous = spec if not scheduled: - logging.info("No steps scheduled; all steps current") + logging.info("No steps scheduled.") return scheduled - def _select_specs(self, specs: Sequence[StepSpec], - run_only: str | None) -> list[StepSpec]: - if not run_only: - return list(specs) - if "." in run_only: + def _filter_run_only(self, specs: Sequence[StepSpec], + run_only: str) -> list[StepSpec]: + is_step = "." 
in run_only + if is_step: scoped = [s for s in specs if s.full_name == run_only] - if not scoped: - raise ValueError(f"run_only target not found: {run_only}") - return scoped - scoped = [s for s in specs if s.phase == run_only] + else: + scoped = [s for s in specs if s.phase == run_only] + if not scoped: - raise ValueError(f"run_only phase not found: {run_only}") + entity = "step" if is_step else "phase" + raise ValueError(f"run_only {entity} not found: {run_only}") return scoped def _should_run(self, spec: StepSpec) -> bool: @@ -355,19 +363,31 @@ def _should_run(self, spec: StepSpec) -> bool: return True return False - - + def _predecessor_newer(self, prev_spec: StepSpec, spec: StepSpec) -> bool: + prev_state = self._state.steps.get(prev_spec.full_name) + curr_state = self._state.steps.get(spec.full_name) + if prev_state is None or prev_state.ended_at_ts is None: + return False + if curr_state is None: + return True + if curr_state.status != "succeeded": + return True + if curr_state.ended_at_ts is None: + return True + return prev_state.ended_at_ts > curr_state.ended_at_ts def build_registry() -> PhaseRegistry: """Constructs the hard-coded Phase 2 registry with canonical steps.""" + def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec: - full = f"{phase}.{step}" + full = _format_full_name(phase, step) return StepSpec( phase=phase, name=step, version=cls.VERSION, - factory=lambda cfg, full_name=full, ctor=cls: ctor(name=full_name, config=cfg), + factory=lambda cfg, full_name=full, ctor=cls: ctor(name=full_name, + config=cfg), ) download = PhaseSpec( @@ -398,7 +418,12 @@ def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec: ) return PhaseRegistry(phases=[download, sample, schema_map, transform]) -def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, registry: PhaseRegistry) -> Pipeline: + +def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, + registry: PhaseRegistry) -> Pipeline: builder = PipelineBuilder(config=config, state=state, registry=registry) return builder.build() + +def _format_full_name(phase: str, step: str) -> str: + return f"{phase}.{step}" diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 3565c4ed61..0f26c61a7e 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -116,6 +116,13 @@ def test_successful_step_persists_expected_schema(self) -> None: self.assertIn("message", step_state) self.assertIsNone(step_state["message"]) self.assertEqual(state["updated_at"], step_state["ended_at"]) + ended_at_dt = datetime.fromisoformat(step_state["ended_at"]) + started_at_dt = datetime.fromisoformat(step_state["started_at"]) + self.assertEqual(step_state["ended_at_ts"], + int(ended_at_dt.timestamp() * 1000)) + self.assertEqual(step_state["started_at_ts"], + int(started_at_dt.timestamp() * 1000)) + self.assertEqual(state["updated_at_ts"], step_state["ended_at_ts"]) def test_failed_step_records_error_and_persists_file(self) -> None: clock = _IncrementingClock( @@ -138,6 +145,8 @@ def test_failed_step_records_error_and_persists_file(self) -> None: self.assertEqual(step_state["status"], "failed") self.assertIn("boom", step_state["message"]) self.assertAlmostEqual(step_state["duration_s"], 7.0) + self.assertIn("ended_at_ts", step_state) + self.assertIn("started_at_ts", step_state) def test_abort_skips_state_persistence(self) -> None: clock = _IncrementingClock( @@ -152,12 +161,15 @@ def 
test_abort_skips_state_persistence(self) -> None: "critical_input_hash": "old", "command": "old command", "updated_at": "2025-01-01T00:00:00Z", + "updated_at_ts": 1, "steps": { "existing.step": { "version": 1, "status": "succeeded", "started_at": "2025-01-01T00:00:00Z", + "started_at_ts": 0, "ended_at": "2025-01-01T00:05:00Z", + "ended_at_ts": 300000, "duration_s": 300.0, "message": None, } @@ -278,20 +290,26 @@ def _empty_state(self) -> PipelineState: updated_at="", steps={}) - def _state_with(self, versions: dict[str, tuple[int, - str]]) -> PipelineState: + def _state_with( + self, versions: dict[str, tuple[int, str, + int | None]]) -> PipelineState: steps = { name: StepState(version=v, status=st, started_at="t", ended_at="t", - duration_s=0.0) for name, (v, st) in versions.items() + duration_s=0.0, + started_at_ts=ts, + ended_at_ts=ts, + message=None) + for name, (v, st, ts) in versions.items() } return PipelineState(run_id="demo", critical_input_hash="", command="", updated_at="", + updated_at_ts=None, steps=steps) def _names_from_builder(self, @@ -308,7 +326,9 @@ def test_run_only_phase_and_step(self) -> None: reg = self._mk_registry() cfg_phase = PipelineConfig(run_only="download") names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual(names_phase, ["download.download-data", "download.download-metadata"]) + self.assertEqual( + names_phase, + ["download.download-data", "download.download-metadata"]) cfg_step = PipelineConfig(run_only="download.download-data") names_step = self._names_from_builder(cfg_step, reg) @@ -316,7 +336,7 @@ def test_run_only_phase_and_step(self) -> None: with self.assertRaisesRegex(ValueError, "run_only phase not found"): self._names_from_builder(PipelineConfig(run_only="nope"), reg) - with self.assertRaisesRegex(ValueError, "run_only target not found"): + with self.assertRaisesRegex(ValueError, "run_only step not found"): self._names_from_builder(PipelineConfig(run_only="download.nope"), reg) @@ -335,54 +355,87 @@ def test_force_semantics(self) -> None: cfg_phase = PipelineConfig(run_only="download", force=True) names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual(names_phase, - ["download.download-data", "download.download-metadata"]) + self.assertEqual( + names_phase, + ["download.download-data", "download.download-metadata"]) + + def test_timestamp_chaining_triggers_next_step(self) -> None: + reg = self._mk_registry() + newer = 2_000 + older = 1_000 + state = self._state_with({ + "download.download-data": (1, "succeeded", newer), + "download.download-metadata": (1, "succeeded", older), + "sample.create-sample": (1, "succeeded", older), + "schema_map.create-schema-mapping": (1, "succeeded", older), + "transform.process-full-data": (1, "succeeded", older), + "transform.create-dc-config": (1, "succeeded", older), + }) + cfg = PipelineConfig() + names = self._names_from_builder(cfg, reg, state) + self.assertEqual(names, [ + "download.download-metadata", + "sample.create-sample", + "schema_map.create-schema-mapping", + "transform.process-full-data", + "transform.create-dc-config", + ]) + + def test_run_only_ignores_timestamp_chaining(self) -> None: + reg = self._mk_registry() + newer = 4_000 + older = 3_000 + state = self._state_with({ + "download.download-data": (1, "succeeded", newer), + "download.download-metadata": (1, "succeeded", older), + }) + cfg = PipelineConfig(run_only="download") + names = self._names_from_builder(cfg, reg, state) + self.assertEqual( + names, ["download.download-data", 
"download.download-metadata"]) def test_version_bump_schedules_downstream(self) -> None: reg = PhaseRegistry(phases=[ - PhaseSpec( - name="download", - steps=[ - StepSpec( - phase="download", - name="download-data", - version=1, - factory=lambda cfg: DownloadDataStep( - name="download.download-data", config=cfg)), - ] - ), - PhaseSpec( - name="transform", - steps=[ - StepSpec( - phase="transform", - name="process-full-data", - version=2, - factory=lambda cfg: ProcessFullDataStep( - name="transform.process-full-data", config=cfg)), - StepSpec( - phase="transform", - name="create-dc-config", - version=1, - factory=lambda cfg: CreateDcConfigStep( - name="transform.create-dc-config", config=cfg)), - ] - ) + PhaseSpec(name="download", + steps=[ + StepSpec( + phase="download", + name="download-data", + version=1, + factory=lambda cfg: DownloadDataStep( + name="download.download-data", config=cfg)), + ]), + PhaseSpec(name="transform", + steps=[ + StepSpec(phase="transform", + name="process-full-data", + version=2, + factory=lambda cfg: ProcessFullDataStep( + name="transform.process-full-data", + config=cfg)), + StepSpec(phase="transform", + name="create-dc-config", + version=1, + factory=lambda cfg: CreateDcConfigStep( + name="transform.create-dc-config", + config=cfg)), + ]) ]) state = self._state_with({ - "download.download-data": (1, "succeeded"), - "transform.process-full-data": (1, "succeeded"), - "transform.create-dc-config": (1, "succeeded"), + "download.download-data": (1, "succeeded", 1000), + "transform.process-full-data": (1, "succeeded", 1000), + "transform.create-dc-config": (1, "succeeded", 1000), }) cfg = PipelineConfig() names = self._names_from_builder(cfg, reg, state) - self.assertEqual(names, [ - "transform.process-full-data", "transform.create-dc-config" - ]) + self.assertEqual( + names, + ["transform.process-full-data", "transform.create-dc-config"]) pipeline = build_sdmx_pipeline(config=cfg, state=state, registry=reg) - self.assertEqual([s.name for s in pipeline.get_steps()], - ["transform.process-full-data", "transform.create-dc-config"]) + self.assertEqual( + [s.name for s in pipeline.get_steps()], + ["transform.process-full-data", "transform.create-dc-config"]) if __name__ == "__main__": diff --git a/tools/agentic_import/state_handler.py b/tools/agentic_import/state_handler.py index cb2cb85b02..f4d2af50b0 100644 --- a/tools/agentic_import/state_handler.py +++ b/tools/agentic_import/state_handler.py @@ -37,6 +37,8 @@ class StepState: started_at: str ended_at: str duration_s: float + started_at_ts: int | None = None + ended_at_ts: int | None = None message: str | None = None @@ -47,6 +49,7 @@ class PipelineState: critical_input_hash: str command: str updated_at: str + updated_at_ts: int | None = None steps: dict[str, StepState] = field(default_factory=dict) @@ -108,6 +111,7 @@ def _empty_state(self) -> PipelineState: critical_input_hash="", command="", updated_at="", + updated_at_ts=None, ) def _backup_bad_file(self) -> None: diff --git a/tools/agentic_import/state_handler_test.py b/tools/agentic_import/state_handler_test.py index b8af010fd7..3d3ad3a1d8 100644 --- a/tools/agentic_import/state_handler_test.py +++ b/tools/agentic_import/state_handler_test.py @@ -47,6 +47,7 @@ def test_missing_file_creates_empty_state(self) -> None: data = json.load(fp) self.assertEqual(data["run_id"], "demo") self.assertEqual(data["steps"], {}) + self.assertIsNone(data["updated_at_ts"]) def test_corrupt_file_creates_backup_and_resets_state(self) -> None: with tempfile.TemporaryDirectory() as 
tmpdir:
@@ -67,6 +68,7 @@ def test_corrupt_file_creates_backup_and_resets_state(self) -> None:
         with open(path, encoding="utf-8") as fp:
             data = json.load(fp)
         self.assertEqual(data["steps"], {})
+        self.assertIsNone(data.get("updated_at_ts"))
 
 
 if __name__ == "__main__":

From d6b0c9868b9757bcc079473e4218646532676886 Mon Sep 17 00:00:00 2001
From: Rohit Kumar
Date: Wed, 12 Nov 2025 09:59:21 +0000
Subject: [PATCH 11/54] refactor(sdmx-import): Simplify phase factory lambda

The factory lambda in `build_registry`'s `_spec` helper is simplified by
removing the redundant `full_name` and `ctor` arguments, as these are
already captured in the closure.
---
 tools/agentic_import/sdmx_import_pipeline.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py
index 3e757e0fc3..65fe07ab22 100644
--- a/tools/agentic_import/sdmx_import_pipeline.py
+++ b/tools/agentic_import/sdmx_import_pipeline.py
@@ -386,8 +386,7 @@ def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec:
             phase=phase,
             name=step,
             version=cls.VERSION,
-            factory=lambda cfg, full_name=full, ctor=cls: ctor(name=full_name,
-                                                               config=cfg),
+            factory=lambda cfg: cls(name=full, config=cfg),
         )
 
     download = PhaseSpec(

From 7f15f17a85adb389be0d31f21f76885e52180ceb Mon Sep 17 00:00:00 2001
From: Rohit Kumar
Date: Wed, 12 Nov 2025 10:22:16 +0000
Subject: [PATCH 12/54] Refactor(sdmx-import): Flatten pipeline steps and remove phases

Removes the concept of Phases from the SDMX import pipeline, opting for
a flat list of steps. This simplifies the pipeline structure and updates
tests accordingly.
---
 tools/agentic_import/sdmx_import_pipeline.py | 98 +++--------
 .../sdmx_import_pipeline_test.py | 156 +++++++-----------
 2 files changed, 86 insertions(+), 168 deletions(-)

diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py
index 65fe07ab22..36b10d5e24 100644
--- a/tools/agentic_import/sdmx_import_pipeline.py
+++ b/tools/agentic_import/sdmx_import_pipeline.py
@@ -275,41 +275,18 @@ def dry_run(self) -> None:
 
 @dataclass(frozen=True)
 class StepSpec:
-    phase: str
     name: str
     version: int
     factory: Callable[[PipelineConfig], Step]
 
-    @property
-    def full_name(self) -> str:
-        return _format_full_name(self.phase, self.name)
-
-
-@dataclass(frozen=True)
-class PhaseSpec:
-    name: str
-    steps: Sequence[StepSpec]
-
-
-@dataclass(frozen=True)
-class PhaseRegistry:
-    phases: Sequence[PhaseSpec]
-
-    def flatten(self) -> list[StepSpec]:
-        flattened: list[StepSpec] = []
-        for phase in self.phases:
-            flattened.extend(phase.steps)
-        return flattened
-
 
 class PipelineBuilder:
 
     def __init__(self, *, config: PipelineConfig, state: PipelineState,
-                 registry: PhaseRegistry) -> None:
+                 steps: Sequence[StepSpec]) -> None:
         self._config = config
         self._state = state
-        self._registry = registry
-        self._specs = self._registry.flatten()
+        self._specs = steps
 
     def build(self) -> Pipeline:
         planned = self._plan_steps()
@@ -342,19 +319,13 @@ def _plan_steps(self) -> list[StepSpec]:
 
     def _filter_run_only(self, specs: Sequence[StepSpec],
                          run_only: str) -> list[StepSpec]:
-        is_step = "." 
in run_only - if is_step: - scoped = [s for s in specs if s.full_name == run_only] - else: - scoped = [s for s in specs if s.phase == run_only] - + scoped = [s for s in specs if s.name == run_only] if not scoped: - entity = "step" if is_step else "phase" - raise ValueError(f"run_only {entity} not found: {run_only}") + raise ValueError(f"run_only step not found: {run_only}") return scoped def _should_run(self, spec: StepSpec) -> bool: - prev = self._state.steps.get(spec.full_name) + prev = self._state.steps.get(spec.name) if prev is None: return True if prev.status != "succeeded": @@ -364,8 +335,8 @@ def _should_run(self, spec: StepSpec) -> bool: return False def _predecessor_newer(self, prev_spec: StepSpec, spec: StepSpec) -> bool: - prev_state = self._state.steps.get(prev_spec.full_name) - curr_state = self._state.steps.get(spec.full_name) + prev_state = self._state.steps.get(prev_spec.name) + curr_state = self._state.steps.get(spec.name) if prev_state is None or prev_state.ended_at_ts is None: return False if curr_state is None: @@ -377,52 +348,27 @@ def _predecessor_newer(self, prev_spec: StepSpec, spec: StepSpec) -> bool: return prev_state.ended_at_ts > curr_state.ended_at_ts -def build_registry() -> PhaseRegistry: - """Constructs the hard-coded Phase 2 registry with canonical steps.""" +def build_step_specs() -> list[StepSpec]: + """Constructs the hard-coded list of canonical steps.""" - def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec: - full = _format_full_name(phase, step) + def _spec(name: str, cls: type[SdmxStep]) -> StepSpec: return StepSpec( - phase=phase, - name=step, + name=name, version=cls.VERSION, - factory=lambda cfg: cls(name=full, config=cfg), + factory=lambda cfg: cls(name=name, config=cfg), ) - download = PhaseSpec( - name="download", - steps=[ - _spec("download", "download-data", DownloadDataStep), - _spec("download", "download-metadata", DownloadMetadataStep), - ], - ) - sample = PhaseSpec( - name="sample", - steps=[ - _spec("sample", "create-sample", CreateSampleStep), - ], - ) - schema_map = PhaseSpec( - name="schema_map", - steps=[ - _spec("schema_map", "create-schema-mapping", CreateSchemaMapStep), - ], - ) - transform = PhaseSpec( - name="transform", - steps=[ - _spec("transform", "process-full-data", ProcessFullDataStep), - _spec("transform", "create-dc-config", CreateDcConfigStep), - ], - ) - return PhaseRegistry(phases=[download, sample, schema_map, transform]) + return [ + _spec("download-data", DownloadDataStep), + _spec("download-metadata", DownloadMetadataStep), + _spec("create-sample", CreateSampleStep), + _spec("create-schema-mapping", CreateSchemaMapStep), + _spec("process-full-data", ProcessFullDataStep), + _spec("create-dc-config", CreateDcConfigStep), + ] def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, - registry: PhaseRegistry) -> Pipeline: - builder = PipelineBuilder(config=config, state=state, registry=registry) + steps: Sequence[StepSpec]) -> Pipeline: + builder = PipelineBuilder(config=config, state=state, steps=steps) return builder.build() - - -def _format_full_name(phase: str, step: str) -> str: - return f"{phase}.{step}" diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 0f26c61a7e..da13a52449 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -41,8 +41,8 @@ from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error 
CreateDcConfigStep, DownloadDataStep, ProcessFullDataStep, InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - PhaseRegistry, PhaseSpec, StepSpec, SdmxStep, build_pipeline_callback, - build_registry, build_sdmx_pipeline) + StepSpec, SdmxStep, build_pipeline_callback, build_step_specs, + build_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -280,8 +280,8 @@ def test_interactive_mode_returns_composite(self) -> None: class PlanningTest(unittest.TestCase): - def _mk_registry(self) -> PhaseRegistry: - return build_registry() + def _mk_steps(self) -> list[StepSpec]: + return build_step_specs() def _empty_state(self) -> PipelineState: return PipelineState(run_id="demo", @@ -314,128 +314,100 @@ def _state_with( def _names_from_builder(self, cfg: PipelineConfig, - reg: PhaseRegistry, + steps: list[StepSpec], state: PipelineState | None = None) -> list[str]: builder = PipelineBuilder(config=cfg, state=state or self._empty_state(), - registry=reg) + steps=steps) pipeline = builder.build() return [step.name for step in pipeline.get_steps()] - def test_run_only_phase_and_step(self) -> None: - reg = self._mk_registry() - cfg_phase = PipelineConfig(run_only="download") - names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual( - names_phase, - ["download.download-data", "download.download-metadata"]) + def test_run_only_step(self) -> None: + steps = self._mk_steps() + cfg_step = PipelineConfig(run_only="download-data") + names_step = self._names_from_builder(cfg_step, steps) + self.assertEqual(names_step, ["download-data"]) - cfg_step = PipelineConfig(run_only="download.download-data") - names_step = self._names_from_builder(cfg_step, reg) - self.assertEqual(names_step, ["download.download-data"]) - - with self.assertRaisesRegex(ValueError, "run_only phase not found"): - self._names_from_builder(PipelineConfig(run_only="nope"), reg) + with self.assertRaisesRegex(ValueError, "run_only step not found"): + self._names_from_builder(PipelineConfig(run_only="nope"), steps) with self.assertRaisesRegex(ValueError, "run_only step not found"): self._names_from_builder(PipelineConfig(run_only="download.nope"), - reg) + steps) def test_force_semantics(self) -> None: - reg = self._mk_registry() + steps = self._mk_steps() cfg_all = PipelineConfig(force=True) - names_all = self._names_from_builder(cfg_all, reg) + names_all = self._names_from_builder(cfg_all, steps) self.assertEqual(names_all, [ - "download.download-data", - "download.download-metadata", - "sample.create-sample", - "schema_map.create-schema-mapping", - "transform.process-full-data", - "transform.create-dc-config", + "download-data", + "download-metadata", + "create-sample", + "create-schema-mapping", + "process-full-data", + "create-dc-config", ]) - cfg_phase = PipelineConfig(run_only="download", force=True) - names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual( - names_phase, - ["download.download-data", "download.download-metadata"]) - def test_timestamp_chaining_triggers_next_step(self) -> None: - reg = self._mk_registry() + steps = self._mk_steps() newer = 2_000 older = 1_000 state = self._state_with({ - "download.download-data": (1, "succeeded", newer), - "download.download-metadata": (1, "succeeded", older), - "sample.create-sample": (1, "succeeded", older), - "schema_map.create-schema-mapping": (1, "succeeded", older), - "transform.process-full-data": (1, "succeeded", older), - 
"transform.create-dc-config": (1, "succeeded", older), + "download-data": (1, "succeeded", newer), + "download-metadata": (1, "succeeded", older), + "create-sample": (1, "succeeded", older), + "create-schema-mapping": (1, "succeeded", older), + "process-full-data": (1, "succeeded", older), + "create-dc-config": (1, "succeeded", older), }) cfg = PipelineConfig() - names = self._names_from_builder(cfg, reg, state) + names = self._names_from_builder(cfg, steps, state) self.assertEqual(names, [ - "download.download-metadata", - "sample.create-sample", - "schema_map.create-schema-mapping", - "transform.process-full-data", - "transform.create-dc-config", + "download-metadata", + "create-sample", + "create-schema-mapping", + "process-full-data", + "create-dc-config", ]) def test_run_only_ignores_timestamp_chaining(self) -> None: - reg = self._mk_registry() + steps = self._mk_steps() newer = 4_000 older = 3_000 state = self._state_with({ - "download.download-data": (1, "succeeded", newer), - "download.download-metadata": (1, "succeeded", older), + "download-data": (1, "succeeded", newer), + "download-metadata": (1, "succeeded", older), }) - cfg = PipelineConfig(run_only="download") - names = self._names_from_builder(cfg, reg, state) - self.assertEqual( - names, ["download.download-data", "download.download-metadata"]) + cfg = PipelineConfig(run_only="download-data") + names = self._names_from_builder(cfg, steps, state) + self.assertEqual(names, ["download-data"]) def test_version_bump_schedules_downstream(self) -> None: - reg = PhaseRegistry(phases=[ - PhaseSpec(name="download", - steps=[ - StepSpec( - phase="download", - name="download-data", - version=1, - factory=lambda cfg: DownloadDataStep( - name="download.download-data", config=cfg)), - ]), - PhaseSpec(name="transform", - steps=[ - StepSpec(phase="transform", - name="process-full-data", - version=2, - factory=lambda cfg: ProcessFullDataStep( - name="transform.process-full-data", - config=cfg)), - StepSpec(phase="transform", - name="create-dc-config", - version=1, - factory=lambda cfg: CreateDcConfigStep( - name="transform.create-dc-config", - config=cfg)), - ]) - ]) + steps = [ + StepSpec(name="download-data", + version=1, + factory=lambda cfg: DownloadDataStep(name="download-data", + config=cfg)), + StepSpec(name="process-full-data", + version=2, + factory=lambda cfg: ProcessFullDataStep( + name="process-full-data", config=cfg)), + StepSpec(name="create-dc-config", + version=1, + factory=lambda cfg: CreateDcConfigStep( + name="create-dc-config", config=cfg)), + ] state = self._state_with({ - "download.download-data": (1, "succeeded", 1000), - "transform.process-full-data": (1, "succeeded", 1000), - "transform.create-dc-config": (1, "succeeded", 1000), + "download-data": (1, "succeeded", 1000), + "process-full-data": (1, "succeeded", 1000), + "create-dc-config": (1, "succeeded", 1000), }) cfg = PipelineConfig() - names = self._names_from_builder(cfg, reg, state) - self.assertEqual( - names, - ["transform.process-full-data", "transform.create-dc-config"]) - - pipeline = build_sdmx_pipeline(config=cfg, state=state, registry=reg) - self.assertEqual( - [s.name for s in pipeline.get_steps()], - ["transform.process-full-data", "transform.create-dc-config"]) + names = self._names_from_builder(cfg, steps, state) + self.assertEqual(names, ["process-full-data", "create-dc-config"]) + + pipeline = build_sdmx_pipeline(config=cfg, state=state, steps=steps) + self.assertEqual([s.name for s in pipeline.get_steps()], + ["process-full-data", 
"create-dc-config"]) if __name__ == "__main__": From 475ba5bf8c47830bda252e755b8050d08bd00ddc Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 12 Nov 2025 12:08:22 +0000 Subject: [PATCH 13/54] refactor(sdmx-import): Simplify pipeline step definition and construction Removes the StepSpec dataclass and updates PipelineBuilder to directly accept Step objects. The build_step_specs function is replaced by build_steps, which directly instantiates Step objects. This simplifies the overall pipeline step definition and construction process. --- tools/agentic_import/sdmx_import_pipeline.py | 87 ++++++++----------- .../sdmx_import_pipeline_test.py | 56 ++++++------ 2 files changed, 62 insertions(+), 81 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 36b10d5e24..a6ef7d274f 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -273,70 +273,62 @@ def dry_run(self) -> None: logging.info(f"{self.name} (dry run): previewing DC config creation") -@dataclass(frozen=True) -class StepSpec: - name: str - version: int - factory: Callable[[PipelineConfig], Step] - - class PipelineBuilder: def __init__(self, *, config: PipelineConfig, state: PipelineState, - steps: Sequence[StepSpec]) -> None: + steps: Sequence[Step]) -> None: self._config = config self._state = state - self._specs = steps + self._steps = steps def build(self) -> Pipeline: planned = self._plan_steps() - steps = [spec.factory(self._config) for spec in planned] - logging.info("Built SDMX pipeline with %d steps", len(steps)) - return Pipeline(steps=steps) + logging.info("Built SDMX pipeline with %d steps", len(planned)) + return Pipeline(steps=planned) - def _plan_steps(self) -> list[StepSpec]: + def _plan_steps(self) -> list[Step]: if self._config.run_only: - return self._filter_run_only(self._specs, self._config.run_only) + return self._filter_run_only(self._steps, self._config.run_only) if self._config.force: - return list(self._specs) - scheduled: list[StepSpec] = [] + return list(self._steps) + scheduled: list[Step] = [] schedule_all_remaining = False - previous: StepSpec | None = None - for spec in self._specs: + previous: Step | None = None + for step in self._steps: if schedule_all_remaining: - scheduled.append(spec) + scheduled.append(step) else: - needs_run = self._should_run(spec) + needs_run = self._should_run(step) if not needs_run and previous is not None: - needs_run = self._predecessor_newer(previous, spec) + needs_run = self._predecessor_newer(previous, step) if needs_run: - scheduled.append(spec) + scheduled.append(step) schedule_all_remaining = True - previous = spec + previous = step if not scheduled: logging.info("No steps scheduled.") return scheduled - def _filter_run_only(self, specs: Sequence[StepSpec], - run_only: str) -> list[StepSpec]: - scoped = [s for s in specs if s.name == run_only] + def _filter_run_only(self, steps: Sequence[Step], + run_only: str) -> list[Step]: + scoped = [s for s in steps if s.name == run_only] if not scoped: raise ValueError(f"run_only step not found: {run_only}") return scoped - def _should_run(self, spec: StepSpec) -> bool: - prev = self._state.steps.get(spec.name) + def _should_run(self, step: Step) -> bool: + prev = self._state.steps.get(step.name) if prev is None: return True if prev.status != "succeeded": return True - if prev.version < spec.version: + if prev.version < step.version: return True return False - def _predecessor_newer(self, prev_spec: 
StepSpec, spec: StepSpec) -> bool: - prev_state = self._state.steps.get(prev_spec.name) - curr_state = self._state.steps.get(spec.name) + def _predecessor_newer(self, prev_step: Step, step: Step) -> bool: + prev_state = self._state.steps.get(prev_step.name) + curr_state = self._state.steps.get(step.name) if prev_state is None or prev_state.ended_at_ts is None: return False if curr_state is None: @@ -348,27 +340,22 @@ def _predecessor_newer(self, prev_spec: StepSpec, spec: StepSpec) -> bool: return prev_state.ended_at_ts > curr_state.ended_at_ts -def build_step_specs() -> list[StepSpec]: +def build_steps(config: PipelineConfig) -> list[Step]: """Constructs the hard-coded list of canonical steps.""" - - def _spec(name: str, cls: type[SdmxStep]) -> StepSpec: - return StepSpec( - name=name, - version=cls.VERSION, - factory=lambda cfg: cls(name=name, config=cfg), - ) - return [ - _spec("download-data", DownloadDataStep), - _spec("download-metadata", DownloadMetadataStep), - _spec("create-sample", CreateSampleStep), - _spec("create-schema-mapping", CreateSchemaMapStep), - _spec("process-full-data", ProcessFullDataStep), - _spec("create-dc-config", CreateDcConfigStep), + DownloadDataStep(name="download-data", config=config), + DownloadMetadataStep(name="download-metadata", config=config), + CreateSampleStep(name="create-sample", config=config), + CreateSchemaMapStep(name="create-schema-mapping", config=config), + ProcessFullDataStep(name="process-full-data", config=config), + CreateDcConfigStep(name="create-dc-config", config=config), ] -def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, - steps: Sequence[StepSpec]) -> Pipeline: - builder = PipelineBuilder(config=config, state=state, steps=steps) +def build_sdmx_pipeline(*, + config: PipelineConfig, + state: PipelineState, + steps: Sequence[Step] | None = None) -> Pipeline: + builder_steps = steps if steps is not None else build_steps(config) + builder = PipelineBuilder(config=config, state=state, steps=builder_steps) return builder.build() diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index da13a52449..b6344933bb 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -39,10 +39,8 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - CreateDcConfigStep, DownloadDataStep, ProcessFullDataStep, InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - StepSpec, SdmxStep, build_pipeline_callback, build_step_specs, - build_sdmx_pipeline) + build_pipeline_callback, build_sdmx_pipeline, build_steps) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -76,6 +74,18 @@ def dry_run(self) -> None: logging.info("noop") +class _VersionedStep(BaseStep): + + def __init__(self, name: str, version: int) -> None: + super().__init__(name=name, version=version) + + def run(self) -> None: + logging.info("noop") + + def dry_run(self) -> None: + logging.info("noop") + + class JSONStateCallbackTest(unittest.TestCase): def _build_callback( @@ -280,9 +290,6 @@ def test_interactive_mode_returns_composite(self) -> None: class PlanningTest(unittest.TestCase): - def _mk_steps(self) -> list[StepSpec]: - return build_step_specs() - def _empty_state(self) -> PipelineState: return PipelineState(run_id="demo", critical_input_hash="", @@ -314,30 +321,28 @@ def _state_with( def 
_names_from_builder(self, cfg: PipelineConfig, - steps: list[StepSpec], + steps: list[BaseStep] | None = None, state: PipelineState | None = None) -> list[str]: + builder_steps = steps or build_steps(cfg) builder = PipelineBuilder(config=cfg, state=state or self._empty_state(), - steps=steps) + steps=builder_steps) pipeline = builder.build() return [step.name for step in pipeline.get_steps()] def test_run_only_step(self) -> None: - steps = self._mk_steps() cfg_step = PipelineConfig(run_only="download-data") - names_step = self._names_from_builder(cfg_step, steps) + names_step = self._names_from_builder(cfg_step) self.assertEqual(names_step, ["download-data"]) with self.assertRaisesRegex(ValueError, "run_only step not found"): - self._names_from_builder(PipelineConfig(run_only="nope"), steps) + self._names_from_builder(PipelineConfig(run_only="nope")) with self.assertRaisesRegex(ValueError, "run_only step not found"): - self._names_from_builder(PipelineConfig(run_only="download.nope"), - steps) + self._names_from_builder(PipelineConfig(run_only="download.nope")) def test_force_semantics(self) -> None: - steps = self._mk_steps() cfg_all = PipelineConfig(force=True) - names_all = self._names_from_builder(cfg_all, steps) + names_all = self._names_from_builder(cfg_all) self.assertEqual(names_all, [ "download-data", "download-metadata", @@ -348,7 +353,6 @@ def test_force_semantics(self) -> None: ]) def test_timestamp_chaining_triggers_next_step(self) -> None: - steps = self._mk_steps() newer = 2_000 older = 1_000 state = self._state_with({ @@ -360,7 +364,7 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: "create-dc-config": (1, "succeeded", older), }) cfg = PipelineConfig() - names = self._names_from_builder(cfg, steps, state) + names = self._names_from_builder(cfg, state=state) self.assertEqual(names, [ "download-metadata", "create-sample", @@ -370,7 +374,6 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: ]) def test_run_only_ignores_timestamp_chaining(self) -> None: - steps = self._mk_steps() newer = 4_000 older = 3_000 state = self._state_with({ @@ -378,23 +381,14 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: "download-metadata": (1, "succeeded", older), }) cfg = PipelineConfig(run_only="download-data") - names = self._names_from_builder(cfg, steps, state) + names = self._names_from_builder(cfg, state=state) self.assertEqual(names, ["download-data"]) def test_version_bump_schedules_downstream(self) -> None: steps = [ - StepSpec(name="download-data", - version=1, - factory=lambda cfg: DownloadDataStep(name="download-data", - config=cfg)), - StepSpec(name="process-full-data", - version=2, - factory=lambda cfg: ProcessFullDataStep( - name="process-full-data", config=cfg)), - StepSpec(name="create-dc-config", - version=1, - factory=lambda cfg: CreateDcConfigStep( - name="create-dc-config", config=cfg)), + _VersionedStep("download-data", 1), + _VersionedStep("process-full-data", 2), + _VersionedStep("create-dc-config", 1), ] state = self._state_with({ "download-data": (1, "succeeded", 1000), From 24d0e0cee3039f16c981506263c918a0bf61fd8e Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 13 Nov 2025 02:46:39 +0000 Subject: [PATCH 14/54] feat(sdmx-import): add run_sdmx_pipeline orchestration resolve dataset prefix/hash, working dir, and state metadata before running update state handler/tests to expect dataset_prefix and command --- tools/agentic_import/sdmx_import_pipeline.py | 90 ++++++++++++- .../sdmx_import_pipeline_test.py | 124 
+++++++++++++++--- tools/agentic_import/state_handler.py | 9 +- tools/agentic_import/state_handler_test.py | 4 +- 4 files changed, 201 insertions(+), 26 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index a6ef7d274f..c6946a4e49 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,15 +15,20 @@ from __future__ import annotations +import hashlib +import json +import os +import re from dataclasses import dataclass from datetime import datetime, timezone +from pathlib import Path from typing import Callable, Sequence from absl import logging from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, - Step) + PipelineRunner, RunnerConfig, Step) from tools.agentic_import.state_handler import (PipelineState, StateHandler, StepState) @@ -57,14 +62,14 @@ class JSONStateCallback(PipelineCallback): def __init__(self, *, state_handler: StateHandler, - run_id: str, + dataset_prefix: str, critical_input_hash: str, command: str, now_fn: Callable[[], datetime] | None = None) -> None: self._handler = state_handler self._now_fn = now_fn or (lambda: datetime.now(timezone.utc)) self._state = self._handler.get_state() - self._state.run_id = run_id + self._state.dataset_prefix = dataset_prefix self._state.critical_input_hash = critical_input_hash self._state.command = command self._step_start_times: dict[str, datetime] = {} @@ -115,7 +120,7 @@ def _now(self) -> datetime: def build_pipeline_callback( *, state_handler: StateHandler, - run_id: str, + dataset_prefix: str, critical_input_hash: str, command: str, skip_confirmation: bool, @@ -123,7 +128,7 @@ def build_pipeline_callback( ) -> PipelineCallback: """Constructs the pipeline callback stack for the SDMX runner.""" json_callback = JSONStateCallback(state_handler=state_handler, - run_id=run_id, + dataset_prefix=dataset_prefix, critical_input_hash=critical_input_hash, command=command, now_fn=now_fn) @@ -141,6 +146,7 @@ class PipelineConfig: phase. Defaults are intentionally minimal. 
""" + command: str endpoint: str | None = None agency: str | None = None dataflow: str | None = None @@ -359,3 +365,77 @@ def build_sdmx_pipeline(*, builder_steps = steps if steps is not None else build_steps(config) builder = PipelineBuilder(config=config, state=state, steps=builder_steps) return builder.build() + + +def _sanitize_run_id(dataflow: str) -> str: + normalized = dataflow.lower() + normalized = re.sub(r"[^a-z0-9_]+", "_", normalized) + normalized = re.sub(r"_+", "_", normalized) + return normalized.strip("_") + + +def _resolve_dataset_prefix(config: PipelineConfig) -> str: + if config.dataset_prefix: + return config.dataset_prefix + if not config.dataflow: + raise ValueError( + "dataflow or dataset_prefix is required to derive dataset prefix") + sanitized = _sanitize_run_id(config.dataflow) + if not sanitized: + raise ValueError("dataflow value is invalid after sanitization") + return sanitized + + +def _compute_critical_input_hash(config: PipelineConfig) -> str: + payload = { + "agency": config.agency, + "dataflow": config.dataflow, + "endpoint": config.endpoint, + "key": config.key, + } + serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(serialized.encode("utf-8")).hexdigest() + + +def _resolve_working_dir(config: PipelineConfig) -> Path: + directory = Path(config.working_dir or os.getcwd()) + if directory.exists(): + if not directory.is_dir(): + raise ValueError(f"working_dir is not a directory: {directory}") + else: + directory.mkdir(parents=True, exist_ok=True) + return directory + + +def run_sdmx_pipeline( + *, + config: PipelineConfig, + now_fn: Callable[[], datetime] | None = None, +) -> None: + """Orchestrates the SDMX pipeline for the provided configuration.""" + working_dir = _resolve_working_dir(config) + dataset_prefix = _resolve_dataset_prefix(config) + state_handler = StateHandler( + state_path=working_dir / ".datacommons" / + f"{dataset_prefix}.state.json", + dataset_prefix=dataset_prefix, + ) + state = state_handler.get_state() + critical_hash = _compute_critical_input_hash(config) + state.dataset_prefix = dataset_prefix + state.command = config.command + state.critical_input_hash = critical_hash + state_handler.save_state() + pipeline = build_sdmx_pipeline(config=config, state=state) + callback = build_pipeline_callback( + state_handler=state_handler, + dataset_prefix=dataset_prefix, + critical_input_hash=critical_hash, + command=config.command, + skip_confirmation=config.skip_confirmation, + now_fn=now_fn, + ) + if config.verbose: + logging.set_verbosity(logging.DEBUG) + runner = PipelineRunner(RunnerConfig()) + runner.run(pipeline, callback) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index b6344933bb..4c6a35db86 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -17,12 +17,15 @@ from __future__ import annotations +import hashlib +import dataclasses import json import os import sys import tempfile import unittest from datetime import datetime, timedelta, timezone +from pathlib import Path from unittest import mock from absl import logging @@ -34,13 +37,16 @@ if path not in sys.path: sys.path.append(path) +_TEST_COMMAND = "sdmx pipeline test" + from tools.agentic_import.pipeline import ( # pylint: disable=import-error BaseStep, CompositeCallback, Pipeline, PipelineAbort, PipelineRunner, RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: 
disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - build_pipeline_callback, build_sdmx_pipeline, build_steps) + build_pipeline_callback, build_sdmx_pipeline, build_steps, + run_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -95,7 +101,7 @@ def _build_callback( handler = StateHandler(state_path=state_path, dataset_prefix="demo") callback = JSONStateCallback( state_handler=handler, - run_id="demo", + dataset_prefix="demo", critical_input_hash="abc123", command="python run", now_fn=clock, @@ -117,7 +123,7 @@ def test_successful_step_persists_expected_schema(self) -> None: state = json.load(fp) step_state = state["steps"]["download.download-data"] - self.assertEqual(state["run_id"], "demo") + self.assertEqual(state["dataset_prefix"], "demo") self.assertEqual(state["critical_input_hash"], "abc123") self.assertEqual(step_state["status"], "succeeded") self.assertIn("started_at", step_state) @@ -167,7 +173,7 @@ def test_abort_skips_state_persistence(self) -> None: os.makedirs(state_dir, exist_ok=True) state_path = os.path.join(state_dir, "demo.state.json") previous = { - "run_id": "previous", + "dataset_prefix": "previous", "critical_input_hash": "old", "command": "old command", "updated_at": "2025-01-01T00:00:00Z", @@ -267,7 +273,7 @@ def test_skip_confirmation_returns_json_callback(self) -> None: handler = self._state_handler_for_tmpdir(tmpdir) callback = build_pipeline_callback( state_handler=handler, - run_id="demo", + dataset_prefix="demo", critical_input_hash="abc", command="python run", skip_confirmation=True, @@ -280,7 +286,7 @@ def test_interactive_mode_returns_composite(self) -> None: with mock.patch("builtins.input", return_value="y"): callback = build_pipeline_callback( state_handler=handler, - run_id="demo", + dataset_prefix="demo", critical_input_hash="abc", command="python run", skip_confirmation=False, @@ -291,7 +297,7 @@ def test_interactive_mode_returns_composite(self) -> None: class PlanningTest(unittest.TestCase): def _empty_state(self) -> PipelineState: - return PipelineState(run_id="demo", + return PipelineState(dataset_prefix="demo", critical_input_hash="", command="", updated_at="", @@ -312,7 +318,7 @@ def _state_with( message=None) for name, (v, st, ts) in versions.items() } - return PipelineState(run_id="demo", + return PipelineState(dataset_prefix="demo", critical_input_hash="", command="", updated_at="", @@ -331,17 +337,20 @@ def _names_from_builder(self, return [step.name for step in pipeline.get_steps()] def test_run_only_step(self) -> None: - cfg_step = PipelineConfig(run_only="download-data") + cfg_step = PipelineConfig(command=_TEST_COMMAND, + run_only="download-data") names_step = self._names_from_builder(cfg_step) self.assertEqual(names_step, ["download-data"]) with self.assertRaisesRegex(ValueError, "run_only step not found"): - self._names_from_builder(PipelineConfig(run_only="nope")) + self._names_from_builder( + PipelineConfig(command=_TEST_COMMAND, run_only="nope")) with self.assertRaisesRegex(ValueError, "run_only step not found"): - self._names_from_builder(PipelineConfig(run_only="download.nope")) + self._names_from_builder( + PipelineConfig(command=_TEST_COMMAND, run_only="download.nope")) def test_force_semantics(self) -> None: - cfg_all = PipelineConfig(force=True) + cfg_all = PipelineConfig(command=_TEST_COMMAND, force=True) names_all = self._names_from_builder(cfg_all) self.assertEqual(names_all, [ "download-data", 
@@ -363,7 +372,7 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: "process-full-data": (1, "succeeded", older), "create-dc-config": (1, "succeeded", older), }) - cfg = PipelineConfig() + cfg = PipelineConfig(command=_TEST_COMMAND) names = self._names_from_builder(cfg, state=state) self.assertEqual(names, [ "download-metadata", @@ -380,7 +389,7 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: "download-data": (1, "succeeded", newer), "download-metadata": (1, "succeeded", older), }) - cfg = PipelineConfig(run_only="download-data") + cfg = PipelineConfig(command=_TEST_COMMAND, run_only="download-data") names = self._names_from_builder(cfg, state=state) self.assertEqual(names, ["download-data"]) @@ -395,7 +404,7 @@ def test_version_bump_schedules_downstream(self) -> None: "process-full-data": (1, "succeeded", 1000), "create-dc-config": (1, "succeeded", 1000), }) - cfg = PipelineConfig() + cfg = PipelineConfig(command=_TEST_COMMAND) names = self._names_from_builder(cfg, steps, state) self.assertEqual(names, ["process-full-data", "create-dc-config"]) @@ -404,5 +413,90 @@ def test_version_bump_schedules_downstream(self) -> None: ["process-full-data", "create-dc-config"]) +class RunPipelineTest(unittest.TestCase): + + def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, + command: str) -> PipelineConfig: + return PipelineConfig(endpoint="https://api.example.com", + agency="TEST_AGENCY", + dataflow=dataflow, + key="test-key", + dataset_prefix=dataset_prefix, + working_dir=self._tmpdir, + skip_confirmation=True, + command=command) + + def setUp(self) -> None: + self._tmpdir_obj = tempfile.TemporaryDirectory() + self.addCleanup(self._tmpdir_obj.cleanup) + self._tmpdir = self._tmpdir_obj.name + + def test_run_pipeline_updates_state_and_hash(self) -> None: + command = "sdmx run pipeline" + config = self._build_config(dataset_prefix="demo", + dataflow="df.1", + command=command) + clock = _IncrementingClock(datetime(2025, 1, 2, tzinfo=timezone.utc), + timedelta(seconds=2)) + + run_sdmx_pipeline(config=config, now_fn=clock) + + state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" + self.assertTrue(state_path.exists()) + with state_path.open(encoding="utf-8") as fp: + state = json.load(fp) + + expected_hash = hashlib.sha256( + json.dumps( + { + "agency": config.agency, + "dataflow": config.dataflow, + "endpoint": config.endpoint, + "key": config.key, + }, + sort_keys=True, + separators=(",", ":")).encode("utf-8")).hexdigest() + + self.assertEqual(state["dataset_prefix"], "demo") + self.assertEqual(state["command"], command) + self.assertEqual(state["critical_input_hash"], expected_hash) + self.assertEqual(len(state["steps"]), 6) + + for step_name in [ + "download-data", "download-metadata", "create-sample", + "create-schema-mapping", "process-full-data", "create-dc-config" + ]: + self.assertIn(step_name, state["steps"]) + self.assertEqual(state["steps"][step_name]["status"], "succeeded") + + def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: + dataflow = "My Flow-Name 2025!!!" 
+ config = self._build_config(dataset_prefix=None, + dataflow=dataflow, + command="sdmx run sanitized") + run_sdmx_pipeline(config=config, + now_fn=_IncrementingClock( + datetime(2025, 1, 3, tzinfo=timezone.utc), + timedelta(seconds=2))) + + expected_run_id = "my_flow_name_2025" + state_path = Path( + self._tmpdir) / ".datacommons" / f"{expected_run_id}.state.json" + self.assertTrue(state_path.exists()) + with state_path.open(encoding="utf-8") as fp: + state = json.load(fp) + self.assertEqual(state["dataset_prefix"], expected_run_id) + + def test_invalid_working_dir_raises(self) -> None: + path = Path(self._tmpdir) / "not_a_dir" + path.write_text("content") + config = dataclasses.replace(self._build_config( + dataset_prefix="demo", dataflow="df", command="sdmx run invalid"), + working_dir=str(path)) + with self.assertRaisesRegex(ValueError, + "working_dir is not a directory"): + run_sdmx_pipeline(config=config) + + if __name__ == "__main__": unittest.main() diff --git a/tools/agentic_import/state_handler.py b/tools/agentic_import/state_handler.py index f4d2af50b0..31dabccc1f 100644 --- a/tools/agentic_import/state_handler.py +++ b/tools/agentic_import/state_handler.py @@ -45,7 +45,7 @@ class StepState: @dataclass_json @dataclass class PipelineState: - run_id: str + dataset_prefix: str critical_input_hash: str command: str updated_at: str @@ -87,8 +87,9 @@ def _load_or_init(self) -> PipelineState: with path.open("r", encoding="utf-8") as fp: data = json.load(fp) state = PipelineState.from_dict(data) - if not state.run_id: - state.run_id = self._dataset_prefix + if not state.dataset_prefix: + # Ensure a manual or corrupted state file still has prefix metadata. + state.dataset_prefix = self._dataset_prefix return state except (OSError, json.JSONDecodeError, ValueError, TypeError) as exc: logging.warning(f"Failed to load state file {path}: {exc}") @@ -107,7 +108,7 @@ def _write_state(self, state: PipelineState) -> None: def _empty_state(self) -> PipelineState: return PipelineState( - run_id=self._dataset_prefix, + dataset_prefix=self._dataset_prefix, critical_input_hash="", command="", updated_at="", diff --git a/tools/agentic_import/state_handler_test.py b/tools/agentic_import/state_handler_test.py index 3d3ad3a1d8..c000260f6d 100644 --- a/tools/agentic_import/state_handler_test.py +++ b/tools/agentic_import/state_handler_test.py @@ -40,12 +40,12 @@ def test_missing_file_creates_empty_state(self) -> None: state = handler.get_state() self.assertTrue(os.path.exists(path)) - self.assertEqual(state.run_id, "demo") + self.assertEqual(state.dataset_prefix, "demo") self.assertEqual(state.steps, {}) with open(path, encoding="utf-8") as fp: data = json.load(fp) - self.assertEqual(data["run_id"], "demo") + self.assertEqual(data["dataset_prefix"], "demo") self.assertEqual(data["steps"], {}) self.assertIsNone(data["updated_at_ts"]) From c41eb3f6172e7065bd4c06cadc05de4e5fab22d5 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 13 Nov 2025 04:27:55 +0000 Subject: [PATCH 15/54] feat(sdmx-import): add CLI entrypoint for SDMX pipeline Map absl flags into PipelineConfig and add a runnable main. 
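For reference, a minimal sketch of exercising the same configuration without the absl entrypoint; the endpoint, agency, and dataflow values below are placeholders, and the flat PipelineConfig fields match this patch (a later patch in this series regroups them under nested sdmx/run configs):

    from tools.agentic_import.sdmx_import_pipeline import (
        PipelineConfig, run_sdmx_pipeline)

    # Roughly what main() assembles from --endpoint/--agency/--dataflow et al.
    config = PipelineConfig(
        command="python tools/agentic_import/sdmx_import_pipeline.py ...",
        endpoint="https://stats.example.org/rest",  # placeholder endpoint
        agency="EXAMPLE_AGENCY",                    # placeholder agency
        dataflow="DF_EXAMPLE",                      # placeholder dataflow id
        skip_confirmation=True,                     # skip the interactive prompt
    )
    run_sdmx_pipeline(config=config)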
--- tools/agentic_import/sdmx_import_pipeline.py | 78 +++++++++++++++++-- .../sdmx_import_pipeline_test.py | 6 +- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index c6946a4e49..4714f1117c 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -19,12 +19,14 @@ import json import os import re +import shlex +import sys from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from typing import Callable, Sequence -from absl import logging +from absl import app, flags, logging from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, @@ -32,6 +34,40 @@ from tools.agentic_import.state_handler import (PipelineState, StateHandler, StepState) +FLAGS = flags.FLAGS + + +def _define_flags() -> None: + flags.DEFINE_string("endpoint", None, "SDMX service endpoint.") + flags.mark_flag_as_required("endpoint") + + flags.DEFINE_string("agency", None, "Owning SDMX agency identifier.") + flags.mark_flag_as_required("agency") + + flags.DEFINE_string("dataflow", None, "Target SDMX dataflow identifier.") + flags.mark_flag_as_required("dataflow") + + flags.DEFINE_string("dataflow_key", None, "Optional SDMX key or filter.") + flags.DEFINE_alias("key", "dataflow_key") + + flags.DEFINE_string( + "dataflow_param", None, + "Optional SDMX parameter appended to the dataflow query.") + + flags.DEFINE_string( + "dataset_prefix", None, + "Optional dataset prefix to override auto-derived values.") + + flags.DEFINE_string("run_only", None, + "Execute only a specific pipeline step by name.") + + flags.DEFINE_boolean("force", False, "Force all steps to run.") + + flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") + + flags.DEFINE_boolean("skip_confirmation", False, + "Skip interactive confirmation prompts.") + def _format_time(value: datetime) -> str: if value.tzinfo is None: @@ -150,15 +186,15 @@ class PipelineConfig: endpoint: str | None = None agency: str | None = None dataflow: str | None = None - key: str | None = None + dataflow_key: str | None = None + dataflow_param: str | None = None dataset_prefix: str | None = None - working_dir: str | None = None + working_dir: str | None = None # TODO: Add CLI flag once semantics stabilize. 
run_only: str | None = None force: bool = False verbose: bool = False skip_confirmation: bool = False - class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -391,7 +427,8 @@ def _compute_critical_input_hash(config: PipelineConfig) -> str: "agency": config.agency, "dataflow": config.dataflow, "endpoint": config.endpoint, - "key": config.key, + "dataflow_key": config.dataflow_key, + "dataflow_param": config.dataflow_param, } serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) return hashlib.sha256(serialized.encode("utf-8")).hexdigest() @@ -439,3 +476,34 @@ def run_sdmx_pipeline( logging.set_verbosity(logging.DEBUG) runner = PipelineRunner(RunnerConfig()) runner.run(pipeline, callback) + + +def prepare_config() -> PipelineConfig: + """Builds PipelineConfig from CLI flags.""" + command = shlex.join(sys.argv) if sys.argv else "python" + return PipelineConfig( + command=command, + endpoint=FLAGS.endpoint, + agency=FLAGS.agency, + dataflow=FLAGS.dataflow, + dataflow_key=FLAGS.dataflow_key, + dataflow_param=FLAGS.dataflow_param, + dataset_prefix=FLAGS.dataset_prefix, + working_dir=None, + run_only=FLAGS.run_only, + force=FLAGS.force, + verbose=FLAGS.verbose, + skip_confirmation=FLAGS.skip_confirmation, + ) + + +def main(_: list[str]) -> int: + config = prepare_config() + logging.info(f"SDMX pipeline configuration: {config}") + run_sdmx_pipeline(config=config) + return 0 + + +if __name__ == "__main__": + _define_flags() + app.run(main) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 4c6a35db86..2007e854cb 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -420,7 +420,8 @@ def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, return PipelineConfig(endpoint="https://api.example.com", agency="TEST_AGENCY", dataflow=dataflow, - key="test-key", + dataflow_key="test-key", + dataflow_param="area=US", dataset_prefix=dataset_prefix, working_dir=self._tmpdir, skip_confirmation=True, @@ -452,7 +453,8 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: "agency": config.agency, "dataflow": config.dataflow, "endpoint": config.endpoint, - "key": config.key, + "dataflow_key": config.dataflow_key, + "dataflow_param": config.dataflow_param, }, sort_keys=True, separators=(",", ":")).encode("utf-8")).hexdigest() From 4be87a72b0722d6fea0e4b3cc8b5c8150d674d1a Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Fri, 14 Nov 2025 04:48:28 +0000 Subject: [PATCH 16/54] refactor: centralize SDMX pipeline planning move hash comparison into PipelineBuilder, snapshot state before planning, and add tests covering rerun vs no-op flows --- tools/agentic_import/sdmx_import_pipeline.py | 52 ++++++++++++++----- .../sdmx_import_pipeline_test.py | 47 +++++++++++++++++ 2 files changed, 85 insertions(+), 14 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 4714f1117c..aad7d6e0de 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,6 +15,7 @@ from __future__ import annotations +import copy import hashlib import json import os @@ -195,6 +196,7 @@ class PipelineConfig: verbose: bool = False skip_confirmation: bool = False + class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -317,22 +319,32 @@ def dry_run(self) -> None: 
class PipelineBuilder: - def __init__(self, *, config: PipelineConfig, state: PipelineState, - steps: Sequence[Step]) -> None: + def __init__(self, + *, + config: PipelineConfig, + state: PipelineState, + steps: Sequence[Step], + critical_input_hash: str | None = None) -> None: self._config = config self._state = state self._steps = steps + self._critical_input_hash = critical_input_hash def build(self) -> Pipeline: - planned = self._plan_steps() + if self._config.run_only: + planned = self._filter_run_only(self._steps, self._config.run_only) + elif self._config.force: + logging.info("Force flag set; scheduling all SDMX steps") + planned = list(self._steps) + elif self._hash_changed(): + logging.info("Critical inputs changed; scheduling all SDMX steps") + planned = list(self._steps) + else: + planned = self._plan_steps() logging.info("Built SDMX pipeline with %d steps", len(planned)) return Pipeline(steps=planned) def _plan_steps(self) -> list[Step]: - if self._config.run_only: - return self._filter_run_only(self._steps, self._config.run_only) - if self._config.force: - return list(self._steps) scheduled: list[Step] = [] schedule_all_remaining = False previous: Step | None = None @@ -358,6 +370,14 @@ def _filter_run_only(self, steps: Sequence[Step], raise ValueError(f"run_only step not found: {run_only}") return scoped + def _hash_changed(self) -> bool: + if not self._critical_input_hash: + return False + previous = self._state.critical_input_hash + if not previous: + return True + return previous != self._critical_input_hash + def _should_run(self, step: Step) -> bool: prev = self._state.steps.get(step.name) if prev is None: @@ -397,9 +417,13 @@ def build_steps(config: PipelineConfig) -> list[Step]: def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, - steps: Sequence[Step] | None = None) -> Pipeline: + steps: Sequence[Step] | None = None, + critical_input_hash: str | None = None) -> Pipeline: builder_steps = steps if steps is not None else build_steps(config) - builder = PipelineBuilder(config=config, state=state, steps=builder_steps) + builder = PipelineBuilder(config=config, + state=state, + steps=builder_steps, + critical_input_hash=critical_input_hash) return builder.build() @@ -458,12 +482,12 @@ def run_sdmx_pipeline( dataset_prefix=dataset_prefix, ) state = state_handler.get_state() + # Snapshot state for planning so callback mutations do not affect scheduling. 
+ state_snapshot = copy.deepcopy(state) critical_hash = _compute_critical_input_hash(config) - state.dataset_prefix = dataset_prefix - state.command = config.command - state.critical_input_hash = critical_hash - state_handler.save_state() - pipeline = build_sdmx_pipeline(config=config, state=state) + pipeline = build_sdmx_pipeline(config=config, + state=state_snapshot, + critical_input_hash=critical_hash) callback = build_pipeline_callback( state_handler=state_handler, dataset_prefix=dataset_prefix, diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 2007e854cb..6e0f77ec1c 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -499,6 +499,53 @@ def test_invalid_working_dir_raises(self) -> None: "working_dir is not a directory"): run_sdmx_pipeline(config=config) + def test_hash_change_forces_full_rerun(self) -> None: + config = self._build_config(dataset_prefix="demo", + dataflow="df.2", + command="sdmx rerun force") + first_clock = _IncrementingClock( + datetime(2025, 1, 4, tzinfo=timezone.utc), timedelta(seconds=1)) + run_sdmx_pipeline(config=config, now_fn=first_clock) + + state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" + with state_path.open(encoding="utf-8") as fp: + first_state = json.load(fp) + + updated_config = dataclasses.replace(config, dataflow_key="changed-key") + second_clock = _IncrementingClock( + datetime(2025, 1, 5, tzinfo=timezone.utc), timedelta(seconds=1)) + run_sdmx_pipeline(config=updated_config, now_fn=second_clock) + + with state_path.open(encoding="utf-8") as fp: + second_state = json.load(fp) + + self.assertNotEqual(first_state["critical_input_hash"], + second_state["critical_input_hash"]) + self.assertGreater( + second_state["steps"]["download-data"]["ended_at_ts"], + first_state["steps"]["download-data"]["ended_at_ts"]) + + def test_hash_unchanged_skips_rerun(self) -> None: + config = self._build_config(dataset_prefix="demo", + dataflow="df.3", + command="sdmx rerun noop") + initial_clock = _IncrementingClock( + datetime(2025, 1, 6, tzinfo=timezone.utc), timedelta(seconds=1)) + run_sdmx_pipeline(config=config, now_fn=initial_clock) + + state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" + with state_path.open(encoding="utf-8") as fp: + first_state = json.load(fp) + + later_clock = _IncrementingClock( + datetime(2025, 1, 7, tzinfo=timezone.utc), timedelta(seconds=1)) + run_sdmx_pipeline(config=config, now_fn=later_clock) + + with state_path.open(encoding="utf-8") as fp: + second_state = json.load(fp) + + self.assertEqual(first_state, second_state) + if __name__ == "__main__": unittest.main() From 69831c23099e0cb30b8573cc8c0e9554949531ff Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Fri, 14 Nov 2025 05:08:33 +0000 Subject: [PATCH 17/54] fix: add repo root to sys.path Ensure sdmx pipeline can run via python tools/ without import errors. 
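For context, the hunk below boils down to the following bootstrap idiom, restated here with explanatory comments (same code, no new behavior):

    import sys
    from pathlib import Path

    # sdmx_import_pipeline.py sits at tools/agentic_import/, so two parent
    # hops above the resolved file path land on the repository root.
    REPO_ROOT = Path(__file__).resolve().parents[2]

    # Insert at the front so absolute imports like tools.agentic_import.pipeline
    # resolve from this checkout even when the script is run directly as
    # "python tools/agentic_import/sdmx_import_pipeline.py".
    if str(REPO_ROOT) not in sys.path:
        sys.path.insert(0, str(REPO_ROOT))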
--- tools/agentic_import/sdmx_import_pipeline.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index aad7d6e0de..f28b2fc7da 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -29,6 +29,10 @@ from absl import app, flags, logging +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, PipelineRunner, RunnerConfig, Step) From 95214cb29104f3820113357616d7b005d8b780ff Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Fri, 14 Nov 2025 10:32:48 +0000 Subject: [PATCH 18/54] chore: log SDMX pipeline step outcomes Track per-step decisions so build/rerun reasons are recorded. --- tools/agentic_import/pipeline.py | 12 +- tools/agentic_import/sdmx_import_pipeline.py | 165 ++++++++++++++---- .../sdmx_import_pipeline_test.py | 36 +++- 3 files changed, 171 insertions(+), 42 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index 6c489e89a4..bf4890b9de 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -121,7 +121,7 @@ def run(self, try: for step in steps: current_step = step - logging.info(f"Preparing step {step.name} (v{step.version})") + logging.info(f"[STEP START] {step.name} (v{step.version})") if callback: callback.before_step(step) error: Exception | None = None @@ -129,13 +129,17 @@ def run(self, step.run() except Exception as exc: # pylint: disable=broad-except error = exc - logging.exception(f"Step {step.name} failed") + logging.exception( + f"[STEP END] {step.name} (v{step.version}) status=failed" + ) raise finally: if callback: callback.after_step(step, error=error) - logging.info(f"Finished step {step.name}") + logging.info( + f"[STEP END] {step.name} (v{step.version}) status=succeeded" + ) logging.info("Pipeline completed") except PipelineAbort: name = current_step.name if current_step else "" - logging.info(f"Pipeline aborted at {name}") + logging.info(f"[STEP END] {name} status=aborted; pipeline aborted") diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index f28b2fc7da..8f419dbf3c 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -25,7 +25,7 @@ from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Callable, Sequence +from typing import Callable, ClassVar, Sequence from absl import app, flags, logging @@ -201,6 +201,26 @@ class PipelineConfig: skip_confirmation: bool = False +@dataclass(frozen=True) +class StepDecision: + """Represents whether a step will run and why.""" + + RUN: ClassVar[str] = "RUN" + SKIP: ClassVar[str] = "SKIP" + + step_name: str + decision: str + reason: str + + +@dataclass(frozen=True) +class BuildResult: + """Output of planning that includes the pipeline and per-step decisions.""" + + pipeline: Pipeline + decisions: list[StepDecision] + + class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -334,45 +354,120 @@ def __init__(self, self._steps = steps self._critical_input_hash = critical_input_hash - def build(self) -> Pipeline: + def build(self) -> BuildResult: if self._config.run_only: - planned = self._filter_run_only(self._steps, 
self._config.run_only) + planned, decisions = self._plan_run_only(self._config.run_only) elif self._config.force: logging.info("Force flag set; scheduling all SDMX steps") - planned = list(self._steps) + planned, decisions = self._plan_all_steps( + "Force flag set; scheduling this step") elif self._hash_changed(): logging.info("Critical inputs changed; scheduling all SDMX steps") - planned = list(self._steps) + planned, decisions = self._plan_all_steps( + "Critical inputs changed; scheduling this step") else: - planned = self._plan_steps() + planned, decisions = self._plan_incremental() logging.info("Built SDMX pipeline with %d steps", len(planned)) - return Pipeline(steps=planned) + return BuildResult(pipeline=Pipeline(steps=planned), + decisions=decisions) + + def _plan_run_only(self, + run_only: str) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + for step in self._steps: + if step.name == run_only: + planned.append(step) + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.RUN, + reason=(f"run_only={run_only} requested; running only " + "this step"), + )) + else: + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.SKIP, + reason=(f"run_only={run_only} requested; skipping " + "this step"), + )) + if not planned: + raise ValueError(f"run_only step not found: {run_only}") + return planned, decisions - def _plan_steps(self) -> list[Step]: - scheduled: list[Step] = [] + def _plan_all_steps(self, + reason: str) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + for step in self._steps: + planned.append(step) + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.RUN, + reason=reason)) + return planned, decisions + + def _plan_incremental(self) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] schedule_all_remaining = False previous: Step | None = None for step in self._steps: if schedule_all_remaining: - scheduled.append(step) + planned.append(step) + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.RUN, + reason=("Upstream step triggered rerun for remaining " + "steps"), + )) + previous = step + continue + + prev_state = self._state.steps.get(step.name) + if prev_state is None: + needs_run = True + reason = "No previous state recorded; scheduling step" + elif prev_state.status != "succeeded": + needs_run = True + reason = (f"Previous run status was {prev_state.status}; " + "rerunning step") + elif prev_state.version < step.version: + needs_run = True + reason = ( + f"Step version increased from {prev_state.version} to " + f"{step.version}; rerunning step") + else: + needs_run = False + reason = ("Previous run succeeded with same version; step is " + "up-to-date") + + if not needs_run and previous is not None: + if self._predecessor_newer(previous, step): + needs_run = True + reason = (f"Previous step {previous.name} finished more " + "recently; rerunning downstream steps") + + if needs_run: + planned.append(step) + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.RUN, + reason=reason)) + schedule_all_remaining = True else: - needs_run = self._should_run(step) - if not needs_run and previous is not None: - needs_run = self._predecessor_newer(previous, step) - if needs_run: - scheduled.append(step) - schedule_all_remaining = True + decisions.append( + 
StepDecision(step_name=step.name, + decision=StepDecision.SKIP, + reason=reason)) previous = step - if not scheduled: - logging.info("No steps scheduled.") - return scheduled - def _filter_run_only(self, steps: Sequence[Step], - run_only: str) -> list[Step]: - scoped = [s for s in steps if s.name == run_only] - if not scoped: - raise ValueError(f"run_only step not found: {run_only}") - return scoped + if not planned: + logging.info("No steps scheduled.") + return planned, decisions def _hash_changed(self) -> bool: if not self._critical_input_hash: @@ -382,16 +477,6 @@ def _hash_changed(self) -> bool: return True return previous != self._critical_input_hash - def _should_run(self, step: Step) -> bool: - prev = self._state.steps.get(step.name) - if prev is None: - return True - if prev.status != "succeeded": - return True - if prev.version < step.version: - return True - return False - def _predecessor_newer(self, prev_step: Step, step: Step) -> bool: prev_state = self._state.steps.get(prev_step.name) curr_state = self._state.steps.get(step.name) @@ -418,6 +503,12 @@ def build_steps(config: PipelineConfig) -> list[Step]: ] +def _log_step_decisions(decisions: Sequence[StepDecision]) -> None: + for decision in decisions: + logging.info("step=%s decision=%s reason=%s", decision.step_name, + decision.decision, decision.reason) + + def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, @@ -428,7 +519,9 @@ def build_sdmx_pipeline(*, state=state, steps=builder_steps, critical_input_hash=critical_input_hash) - return builder.build() + result = builder.build() + _log_step_decisions(result.decisions) + return result.pipeline def _sanitize_run_id(dataflow: str) -> str: diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 6e0f77ec1c..326ef63336 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -45,7 +45,7 @@ ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - build_pipeline_callback, build_sdmx_pipeline, build_steps, + StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -333,7 +333,8 @@ def _names_from_builder(self, builder = PipelineBuilder(config=cfg, state=state or self._empty_state(), steps=builder_steps) - pipeline = builder.build() + result = builder.build() + pipeline = result.pipeline return [step.name for step in pipeline.get_steps()] def test_run_only_step(self) -> None: @@ -382,6 +383,18 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: "create-dc-config", ]) + def test_force_branch_records_decisions(self) -> None: + cfg = PipelineConfig(command=_TEST_COMMAND, force=True) + steps = build_steps(cfg) + builder = PipelineBuilder(config=cfg, + state=self._empty_state(), + steps=steps) + result = builder.build() + self.assertEqual(len(result.decisions), len(steps)) + for decision in result.decisions: + self.assertEqual(decision.decision, StepDecision.RUN) + self.assertIn("Force flag set", decision.reason) + def test_run_only_ignores_timestamp_chaining(self) -> None: newer = 4_000 older = 3_000 @@ -412,6 +425,25 @@ def test_version_bump_schedules_downstream(self) -> None: self.assertEqual([s.name for s in pipeline.get_steps()], ["process-full-data", 
"create-dc-config"]) + def test_incremental_records_skip_reasons(self) -> None: + state = self._state_with({ + "download-data": (1, "succeeded", 1_000), + "download-metadata": (1, "succeeded", 1_000), + "create-sample": (1, "succeeded", 1_000), + "create-schema-mapping": (1, "succeeded", 1_000), + "process-full-data": (1, "succeeded", 1_000), + "create-dc-config": (1, "succeeded", 1_000), + }) + cfg = PipelineConfig(command=_TEST_COMMAND) + steps = build_steps(cfg) + builder = PipelineBuilder(config=cfg, state=state, steps=steps) + result = builder.build() + self.assertFalse(result.pipeline.get_steps()) + self.assertEqual(len(result.decisions), len(steps)) + for decision in result.decisions: + self.assertEqual(decision.decision, StepDecision.SKIP) + self.assertIn("up-to-date", decision.reason) + class RunPipelineTest(unittest.TestCase): From 7400c0d88177a817c330f0572623577ec3b1a312 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 25 Nov 2025 10:00:13 +0000 Subject: [PATCH 19/54] feat: implement metadata download step --- tools/agentic_import/sdmx_import_pipeline.py | 54 ++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 8f419dbf3c..4291663260 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -21,6 +21,7 @@ import os import re import shlex +import subprocess import sys from dataclasses import dataclass from datetime import datetime, timezone @@ -33,6 +34,8 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) +SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" + from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, PipelineRunner, RunnerConfig, Step) @@ -42,6 +45,20 @@ FLAGS = flags.FLAGS +def _require_config_field(value: str | None, field: str, + step_name: str) -> str: + if value: + return value + raise ValueError(f"{step_name} requires config.{field}") + + +def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: + command = [sys.executable, str(SDMX_CLI_PATH), *args] + if verbose: + logging.debug(f"Running SDMX CLI command: {' '.join(command)}") + subprocess.run(command, check=True) + + def _define_flags() -> None: flags.DEFINE_string("endpoint", None, "SDMX service endpoint.") flags.mark_flag_as_required("endpoint") @@ -268,12 +285,43 @@ def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) def run(self) -> None: - logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + endpoint = _require_config_field(self._config.endpoint, "endpoint", + self.name) + agency = _require_config_field(self._config.agency, "agency", + self.name) + dataflow = _require_config_field(self._config.dataflow, "dataflow", + self.name) + dataset_prefix = _resolve_dataset_prefix(self._config) + working_dir = _resolve_working_dir(self._config) + output_path = working_dir / f"{dataset_prefix}_metadata.xml" + if self._config.verbose: + logging.info( + f"Starting SDMX metadata download: endpoint={endpoint} " + f"agency={agency} dataflow={dataflow} -> {output_path}") + else: + logging.info(f"Downloading SDMX metadata to {output_path}") + args = [ + "download-metadata", + f"--endpoint={endpoint}", + f"--agency={agency}", + f"--dataflow={dataflow}", + f"--output_path={output_path}", + ] + if self._config.verbose: + 
args.append("--verbose") + _run_sdmx_cli(args, verbose=self._config.verbose) def dry_run(self) -> None: + dataset_prefix = _resolve_dataset_prefix(self._config) + working_dir = Path(self._config.working_dir + or os.getcwd()).resolve() + output_path = working_dir / f"{dataset_prefix}_metadata.xml" + endpoint = self._config.endpoint or "" + agency = self._config.agency or "" + dataflow = self._config.dataflow or "" logging.info( - f"{self.name} (dry run): previewing metadata download inputs") + f"{self.name} (dry run): would fetch endpoint={endpoint} " + f"agency={agency} dataflow={dataflow} -> {output_path}") class CreateSampleStep(SdmxStep): From 3ea7f21aaca0b7cbcaaa8acb215b1b37fc1ec597 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 10:48:25 +0000 Subject: [PATCH 20/54] Refactor SDMX pipeline to use shared subprocess wrapper and cached command preparation --- tools/agentic_import/sdmx_import_pipeline.py | 62 ++++++++------- .../sdmx_import_pipeline_test.py | 75 ++++++++++++++++++- 2 files changed, 111 insertions(+), 26 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 4291663260..f1f6cb90e5 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -23,7 +23,7 @@ import shlex import subprocess import sys -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Callable, ClassVar, Sequence @@ -45,20 +45,30 @@ FLAGS = flags.FLAGS -def _require_config_field(value: str | None, field: str, - step_name: str) -> str: +def _require_config_field(value: str | None, field: str, step_name: str) -> str: if value: return value raise ValueError(f"{step_name} requires config.{field}") -def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: - command = [sys.executable, str(SDMX_CLI_PATH), *args] +@dataclass(frozen=True) +class CommandPlan: + """Holds a constructed command and its expected output path.""" + full_command: list[str] + output_path: Path + + +def _run_command(command: Sequence[str], *, verbose: bool) -> None: if verbose: - logging.debug(f"Running SDMX CLI command: {' '.join(command)}") + logging.debug(f"Running command: {' '.join(command)}") subprocess.run(command, check=True) +def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: + command = [sys.executable, str(SDMX_CLI_PATH), *args] + _run_command(command, verbose=verbose) + + def _define_flags() -> None: flags.DEFINE_string("endpoint", None, "SDMX service endpoint.") flags.mark_flag_as_required("endpoint") @@ -283,23 +293,19 @@ class DownloadMetadataStep(SdmxStep): def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._plan: CommandPlan | None = None - def run(self) -> None: + def _prepare_command(self) -> CommandPlan: + if self._plan: + return self._plan endpoint = _require_config_field(self._config.endpoint, "endpoint", self.name) - agency = _require_config_field(self._config.agency, "agency", - self.name) + agency = _require_config_field(self._config.agency, "agency", self.name) dataflow = _require_config_field(self._config.dataflow, "dataflow", self.name) dataset_prefix = _resolve_dataset_prefix(self._config) working_dir = _resolve_working_dir(self._config) output_path = working_dir / f"{dataset_prefix}_metadata.xml" - if self._config.verbose: - logging.info( - f"Starting SDMX metadata 
download: endpoint={endpoint} " - f"agency={agency} dataflow={dataflow} -> {output_path}") - else: - logging.info(f"Downloading SDMX metadata to {output_path}") args = [ "download-metadata", f"--endpoint={endpoint}", @@ -309,19 +315,25 @@ def run(self) -> None: ] if self._config.verbose: args.append("--verbose") - _run_sdmx_cli(args, verbose=self._config.verbose) + full_command = [sys.executable, str(SDMX_CLI_PATH)] + args + self._plan = CommandPlan(full_command=full_command, + output_path=output_path) + return self._plan + + def run(self) -> None: + plan = self._prepare_command() + if self._config.verbose: + logging.info( + f"Starting SDMX metadata download: {' '.join(plan.full_command)} -> {plan.output_path}" + ) + else: + logging.info(f"Downloading SDMX metadata to {plan.output_path}") + _run_command(plan.full_command, verbose=self._config.verbose) def dry_run(self) -> None: - dataset_prefix = _resolve_dataset_prefix(self._config) - working_dir = Path(self._config.working_dir - or os.getcwd()).resolve() - output_path = working_dir / f"{dataset_prefix}_metadata.xml" - endpoint = self._config.endpoint or "" - agency = self._config.agency or "" - dataflow = self._config.dataflow or "" + plan = self._prepare_command() logging.info( - f"{self.name} (dry run): would fetch endpoint={endpoint} " - f"agency={agency} dataflow={dataflow} -> {output_path}") + f"{self.name} (dry run): would run {' '.join(plan.full_command)}") class CreateSampleStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 326ef63336..88457391de 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -46,7 +46,7 @@ from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, - run_sdmx_pipeline) + run_sdmx_pipeline, DownloadMetadataStep, _run_command) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -463,6 +463,11 @@ def setUp(self) -> None: self._tmpdir_obj = tempfile.TemporaryDirectory() self.addCleanup(self._tmpdir_obj.cleanup) self._tmpdir = self._tmpdir_obj.name + # Mock _run_command to avoid actual execution during pipeline tests + self._run_command_patcher = mock.patch( + "tools.agentic_import.sdmx_import_pipeline._run_command") + self._mock_run_command = self._run_command_patcher.start() + self.addCleanup(self._run_command_patcher.stop) def test_run_pipeline_updates_state_and_hash(self) -> None: command = "sdmx run pipeline" @@ -579,5 +584,73 @@ def test_hash_unchanged_skips_rerun(self) -> None: self.assertEqual(first_state, second_state) +class SdmxStepTest(unittest.TestCase): + + def setUp(self) -> None: + self._tmpdir_obj = tempfile.TemporaryDirectory() + self.addCleanup(self._tmpdir_obj.cleanup) + self._tmpdir = self._tmpdir_obj.name + + def test_run_command_logs_and_executes(self) -> None: + with mock.patch("subprocess.run") as mock_run: + with self.assertLogs(logging.get_absl_logger(), + level="DEBUG") as logs: + _run_command(["echo", "hello"], verbose=True) + + mock_run.assert_called_once_with(["echo", "hello"], check=True) + self.assertTrue( + any("Running command: echo hello" in entry + for entry in logs.output)) + + def test_download_metadata_step_caches_plan(self) -> None: + config = PipelineConfig(command="test", + 
endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True) + step = DownloadMetadataStep(name="test-step", config=config) + + # First call creates plan + plan1 = step._prepare_command() + self.assertIn("download-metadata", plan1.full_command) + self.assertIn("--endpoint=https://example.com", plan1.full_command) + + # Second call returns same object + plan2 = step._prepare_command() + self.assertIs(plan1, plan2) + + def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig(command="test", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True) + step = DownloadMetadataStep(name="test-step", config=config) + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("download-metadata" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("download-metadata", args[0]) + self.assertTrue(kwargs["verbose"]) + + if __name__ == "__main__": unittest.main() From c1936bd16c55a9881a95f89b38261587436b451b Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 11:12:27 +0000 Subject: [PATCH 21/54] Implement DownloadDataStep in sdmx_import_pipeline --- tools/agentic_import/sdmx_import_pipeline.py | 44 ++++++++++++++- .../sdmx_import_pipeline_test.py | 55 ++++++++++++++++++- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index f1f6cb90e5..d259a292d9 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -277,13 +277,51 @@ class DownloadDataStep(SdmxStep): def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._plan: CommandPlan | None = None + + def _prepare_command(self) -> CommandPlan: + if self._plan: + return self._plan + endpoint = _require_config_field(self._config.endpoint, "endpoint", + self.name) + agency = _require_config_field(self._config.agency, "agency", self.name) + dataflow = _require_config_field(self._config.dataflow, "dataflow", + self.name) + dataset_prefix = _resolve_dataset_prefix(self._config) + working_dir = _resolve_working_dir(self._config) + output_path = working_dir / f"{dataset_prefix}_data.csv" + args = [ + "download-data", + f"--endpoint={endpoint}", + f"--agency={agency}", + f"--dataflow={dataflow}", + f"--output_path={output_path}", + ] + if self._config.dataflow_key: + args.append(f"--key={self._config.dataflow_key}") + if self._config.dataflow_param: + args.append(f"--param={self._config.dataflow_param}") + if self._config.verbose: + args.append("--verbose") + full_command = [sys.executable, str(SDMX_CLI_PATH)] + args + self._plan = CommandPlan(full_command=full_command, + output_path=output_path) + return self._plan def run(self) -> None: - logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + plan = self._prepare_command() + if self._config.verbose: + 
logging.info( + f"Starting SDMX data download: {' '.join(plan.full_command)} -> {plan.output_path}" + ) + else: + logging.info(f"Downloading SDMX data to {plan.output_path}") + _run_command(plan.full_command, verbose=self._config.verbose) def dry_run(self) -> None: - logging.info(f"{self.name} (dry run): previewing data download inputs") + plan = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(plan.full_command)}") class DownloadMetadataStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 88457391de..6de7f350e9 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -46,7 +46,7 @@ from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, - run_sdmx_pipeline, DownloadMetadataStep, _run_command) + run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, _run_command) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -651,6 +651,59 @@ def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("download-metadata", args[0]) self.assertTrue(kwargs["verbose"]) + def test_download_data_step_caches_plan(self) -> None: + config = PipelineConfig(command="test", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW", + dataflow_key="test-key", + dataflow_param="area=US", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True) + step = DownloadDataStep(name="test-step", config=config) + + # First call creates plan + plan1 = step._prepare_command() + self.assertIn("download-data", plan1.full_command) + self.assertIn("--endpoint=https://example.com", plan1.full_command) + self.assertIn("--key=test-key", plan1.full_command) + self.assertIn("--param=area=US", plan1.full_command) + + # Second call returns same object + plan2 = step._prepare_command() + self.assertIs(plan1, plan2) + + def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig(command="test", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True) + step = DownloadDataStep(name="test-step", config=config) + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("download-data" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("download-data", args[0]) + self.assertTrue(kwargs["verbose"]) + if __name__ == "__main__": unittest.main() From db4caa426c2a1209792e93499e059b22454bad4a Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 16:34:21 +0000 Subject: [PATCH 22/54] Refactor SDMX config resolution and align flag names - Refactored SDMX configuration flags to hierarchical dot-notation (e.g., sdmx.dataflow.id). - Introduced constants for flag names to reduce duplication. 
- Aligned critical input hash keys with flag names. - Implemented _resolve_config to handle dataset prefix and working dir resolution once. - Updated steps to use pre-resolved configuration. - Fixed tests and lint issues. --- tools/agentic_import/sdmx_import_pipeline.py | 255 ++++++++++++------ .../sdmx_import_pipeline_test.py | 231 ++++++++++++---- 2 files changed, 348 insertions(+), 138 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index d259a292d9..654e2b1278 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -23,6 +23,7 @@ import shlex import subprocess import sys +import dataclasses from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path @@ -35,6 +36,15 @@ sys.path.insert(0, str(REPO_ROOT)) SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" +DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" + +# Flag names +_FLAG_SDMX_ENDPOINT = "sdmx.endpoint" +_FLAG_SDMX_AGENCY = "sdmx.agency" +_FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" +_FLAG_SDMX_DATAFLOW_KEY = "sdmx.dataflow.key" +_FLAG_SDMX_DATAFLOW_PARAM = "sdmx.dataflow.param" +_FLAG_SAMPLE_ROWS = "sample.rows" from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, @@ -70,22 +80,27 @@ def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: def _define_flags() -> None: - flags.DEFINE_string("endpoint", None, "SDMX service endpoint.") - flags.mark_flag_as_required("endpoint") + flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") + flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) - flags.DEFINE_string("agency", None, "Owning SDMX agency identifier.") - flags.mark_flag_as_required("agency") + flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, + "Owning SDMX agency identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) - flags.DEFINE_string("dataflow", None, "Target SDMX dataflow identifier.") - flags.mark_flag_as_required("dataflow") + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, + "Target SDMX dataflow identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) - flags.DEFINE_string("dataflow_key", None, "Optional SDMX key or filter.") - flags.DEFINE_alias("key", "dataflow_key") + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, + "Optional SDMX key or filter.") flags.DEFINE_string( - "dataflow_param", None, + _FLAG_SDMX_DATAFLOW_PARAM, None, "Optional SDMX parameter appended to the dataflow query.") + flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, + "Number of rows to sample from downloaded data.") + flags.DEFINE_string( "dataset_prefix", None, "Optional dataset prefix to override auto-derived values.") @@ -207,27 +222,47 @@ def build_pipeline_callback( @dataclass(frozen=True) -class PipelineConfig: - """User-configurable inputs that mimic planned CLI flags. +class SdmxDataflowConfig: + """Configuration for SDMX dataflow.""" + id: str | None = None + key: str | None = None + param: str | None = None - This is a lightweight container; CLI parsing will be added in a later - phase. Defaults are intentionally minimal. 
- """ - command: str +@dataclass(frozen=True) +class SdmxConfig: + """Configuration for SDMX data access.""" endpoint: str | None = None agency: str | None = None - dataflow: str | None = None - dataflow_key: str | None = None - dataflow_param: str | None = None + dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) + + +@dataclass(frozen=True) +class SampleConfig: + """Configuration for data sampling.""" + rows: int = 1000 + + +@dataclass(frozen=True) +class RunConfig: + """Configuration for pipeline execution.""" + command: str dataset_prefix: str | None = None - working_dir: str | None = None # TODO: Add CLI flag once semantics stabilize. + working_dir: str | None = None run_only: str | None = None force: bool = False verbose: bool = False skip_confirmation: bool = False +@dataclass(frozen=True) +class PipelineConfig: + """Aggregated configuration for the pipeline.""" + sdmx: SdmxConfig = field(default_factory=SdmxConfig) + sample: SampleConfig = field(default_factory=SampleConfig) + run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) + + @dataclass(frozen=True) class StepDecision: """Represents whether a step will run and why.""" @@ -282,13 +317,14 @@ def __init__(self, *, name: str, config: PipelineConfig) -> None: def _prepare_command(self) -> CommandPlan: if self._plan: return self._plan - endpoint = _require_config_field(self._config.endpoint, "endpoint", - self.name) - agency = _require_config_field(self._config.agency, "agency", self.name) - dataflow = _require_config_field(self._config.dataflow, "dataflow", - self.name) - dataset_prefix = _resolve_dataset_prefix(self._config) - working_dir = _resolve_working_dir(self._config) + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) output_path = working_dir / f"{dataset_prefix}_data.csv" args = [ "download-data", @@ -297,11 +333,11 @@ def _prepare_command(self) -> CommandPlan: f"--dataflow={dataflow}", f"--output_path={output_path}", ] - if self._config.dataflow_key: - args.append(f"--key={self._config.dataflow_key}") - if self._config.dataflow_param: - args.append(f"--param={self._config.dataflow_param}") - if self._config.verbose: + if self._config.sdmx.dataflow.key: + args.append(f"--key={self._config.sdmx.dataflow.key}") + if self._config.sdmx.dataflow.param: + args.append(f"--param={self._config.sdmx.dataflow.param}") + if self._config.run.verbose: args.append("--verbose") full_command = [sys.executable, str(SDMX_CLI_PATH)] + args self._plan = CommandPlan(full_command=full_command, @@ -310,13 +346,13 @@ def _prepare_command(self) -> CommandPlan: def run(self) -> None: plan = self._prepare_command() - if self._config.verbose: + if self._config.run.verbose: logging.info( f"Starting SDMX data download: {' '.join(plan.full_command)} -> {plan.output_path}" ) else: logging.info(f"Downloading SDMX data to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.verbose) + _run_command(plan.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: plan = self._prepare_command() @@ -336,13 +372,14 @@ def __init__(self, *, name: str, config: PipelineConfig) -> None: def _prepare_command(self) -> CommandPlan: if self._plan: 
return self._plan - endpoint = _require_config_field(self._config.endpoint, "endpoint", - self.name) - agency = _require_config_field(self._config.agency, "agency", self.name) - dataflow = _require_config_field(self._config.dataflow, "dataflow", - self.name) - dataset_prefix = _resolve_dataset_prefix(self._config) - working_dir = _resolve_working_dir(self._config) + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) output_path = working_dir / f"{dataset_prefix}_metadata.xml" args = [ "download-metadata", @@ -351,7 +388,7 @@ def _prepare_command(self) -> CommandPlan: f"--dataflow={dataflow}", f"--output_path={output_path}", ] - if self._config.verbose: + if self._config.run.verbose: args.append("--verbose") full_command = [sys.executable, str(SDMX_CLI_PATH)] + args self._plan = CommandPlan(full_command=full_command, @@ -360,13 +397,13 @@ def _prepare_command(self) -> CommandPlan: def run(self) -> None: plan = self._prepare_command() - if self._config.verbose: + if self._config.run.verbose: logging.info( f"Starting SDMX metadata download: {' '.join(plan.full_command)} -> {plan.output_path}" ) else: logging.info(f"Downloading SDMX metadata to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.verbose) + _run_command(plan.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: plan = self._prepare_command() @@ -381,13 +418,52 @@ class CreateSampleStep(SdmxStep): def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._plan: CommandPlan | None = None + + def _prepare_command(self) -> CommandPlan: + if self._plan: + return self._plan + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) + input_path = working_dir / f"{dataset_prefix}_data.csv" + output_path = working_dir / f"{dataset_prefix}_sample.csv" + + # Check input file existence before running, but allow plan creation. + # In a real run, this will fail early if download-data didn't run. 
+ args = [ + f"--sampler_input={input_path}", + f"--sampler_output={output_path}", + f"--sampler_output_rows={self._config.sample.rows}", + ] + full_command = [sys.executable, str(DATA_SAMPLER_PATH)] + args + self._plan = CommandPlan(full_command=full_command, + output_path=output_path) + return self._plan def run(self) -> None: - logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + plan = self._prepare_command() + # Find input path from command args + input_path_arg = next((arg for arg in plan.full_command + if arg.startswith("--sampler_input=")), None) + if not input_path_arg: + raise RuntimeError("Could not find sampler_input in command") + input_path = Path(input_path_arg.split("=")[1]) + + if not input_path.is_file(): + raise RuntimeError(f"Input file missing for sampling: {input_path}") + + if self._config.run.verbose: + logging.info( + f"Starting data sampling: {' '.join(plan.full_command)} -> {plan.output_path}" + ) + else: + logging.info(f"Sampling data to {plan.output_path}") + _run_command(plan.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - logging.info(f"{self.name} (dry run): previewing sample generation") + plan = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(plan.full_command)}") class CreateSchemaMapStep(SdmxStep): @@ -453,9 +529,9 @@ def __init__(self, self._critical_input_hash = critical_input_hash def build(self) -> BuildResult: - if self._config.run_only: - planned, decisions = self._plan_run_only(self._config.run_only) - elif self._config.force: + if self._config.run.run_only: + planned, decisions = self._plan_run_only(self._config.run.run_only) + elif self._config.run.force: logging.info("Force flag set; scheduling all SDMX steps") planned, decisions = self._plan_all_steps( "Force flag set; scheduling this step") @@ -630,12 +706,13 @@ def _sanitize_run_id(dataflow: str) -> str: def _resolve_dataset_prefix(config: PipelineConfig) -> str: - if config.dataset_prefix: - return config.dataset_prefix - if not config.dataflow: + if config.run.dataset_prefix: + return config.run.dataset_prefix + if not config.sdmx.dataflow.id: raise ValueError( - "dataflow or dataset_prefix is required to derive dataset prefix") - sanitized = _sanitize_run_id(config.dataflow) + "dataflow.id or dataset_prefix is required to derive dataset prefix" + ) + sanitized = _sanitize_run_id(config.sdmx.dataflow.id) if not sanitized: raise ValueError("dataflow value is invalid after sanitization") return sanitized @@ -643,18 +720,18 @@ def _resolve_dataset_prefix(config: PipelineConfig) -> str: def _compute_critical_input_hash(config: PipelineConfig) -> str: payload = { - "agency": config.agency, - "dataflow": config.dataflow, - "endpoint": config.endpoint, - "dataflow_key": config.dataflow_key, - "dataflow_param": config.dataflow_param, + _FLAG_SDMX_AGENCY: config.sdmx.agency, + _FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, + _FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, + _FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, + _FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, } serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) return hashlib.sha256(serialized.encode("utf-8")).hexdigest() def _resolve_working_dir(config: PipelineConfig) -> Path: - directory = Path(config.working_dir or os.getcwd()) + directory = Path(config.run.working_dir or os.getcwd()) if directory.exists(): if not directory.is_dir(): raise ValueError(f"working_dir is not a directory: {directory}") @@ -663,14 
+740,25 @@ def _resolve_working_dir(config: PipelineConfig) -> Path: return directory +def _resolve_config(config: PipelineConfig) -> PipelineConfig: + """Resolves dynamic configuration values and returns a new config.""" + dataset_prefix = _resolve_dataset_prefix(config) + working_dir = _resolve_working_dir(config) + new_run = dataclasses.replace(config.run, + dataset_prefix=dataset_prefix, + working_dir=str(working_dir)) + return dataclasses.replace(config, run=new_run) + + def run_sdmx_pipeline( *, config: PipelineConfig, now_fn: Callable[[], datetime] | None = None, ) -> None: """Orchestrates the SDMX pipeline for the provided configuration.""" - working_dir = _resolve_working_dir(config) - dataset_prefix = _resolve_dataset_prefix(config) + resolved_config = _resolve_config(config) + working_dir = Path(resolved_config.run.working_dir) + dataset_prefix = resolved_config.run.dataset_prefix state_handler = StateHandler( state_path=working_dir / ".datacommons" / f"{dataset_prefix}.state.json", @@ -679,19 +767,19 @@ def run_sdmx_pipeline( state = state_handler.get_state() # Snapshot state for planning so callback mutations do not affect scheduling. state_snapshot = copy.deepcopy(state) - critical_hash = _compute_critical_input_hash(config) - pipeline = build_sdmx_pipeline(config=config, + critical_hash = _compute_critical_input_hash(resolved_config) + pipeline = build_sdmx_pipeline(config=resolved_config, state=state_snapshot, critical_input_hash=critical_hash) callback = build_pipeline_callback( state_handler=state_handler, dataset_prefix=dataset_prefix, critical_input_hash=critical_hash, - command=config.command, - skip_confirmation=config.skip_confirmation, + command=resolved_config.run.command, + skip_confirmation=resolved_config.run.skip_confirmation, now_fn=now_fn, ) - if config.verbose: + if resolved_config.run.verbose: logging.set_verbosity(logging.DEBUG) runner = PipelineRunner(RunnerConfig()) runner.run(pipeline, callback) @@ -699,20 +787,29 @@ def run_sdmx_pipeline( def prepare_config() -> PipelineConfig: """Builds PipelineConfig from CLI flags.""" + # absl.flags doesn't support dots in attribute access easily, + # so we access the flag values directly from the flag names. 
command = shlex.join(sys.argv) if sys.argv else "python" return PipelineConfig( - command=command, - endpoint=FLAGS.endpoint, - agency=FLAGS.agency, - dataflow=FLAGS.dataflow, - dataflow_key=FLAGS.dataflow_key, - dataflow_param=FLAGS.dataflow_param, - dataset_prefix=FLAGS.dataset_prefix, - working_dir=None, - run_only=FLAGS.run_only, - force=FLAGS.force, - verbose=FLAGS.verbose, - skip_confirmation=FLAGS.skip_confirmation, + sdmx=SdmxConfig( + endpoint=FLAGS[_FLAG_SDMX_ENDPOINT].value, + agency=FLAGS[_FLAG_SDMX_AGENCY].value, + dataflow=SdmxDataflowConfig( + id=FLAGS[_FLAG_SDMX_DATAFLOW_ID].value, + key=FLAGS[_FLAG_SDMX_DATAFLOW_KEY].value, + param=FLAGS[_FLAG_SDMX_DATAFLOW_PARAM].value, + ), + ), + sample=SampleConfig(rows=FLAGS[_FLAG_SAMPLE_ROWS].value,), + run=RunConfig( + command=command, + dataset_prefix=FLAGS.dataset_prefix, + working_dir=None, + run_only=FLAGS.run_only, + force=FLAGS.force, + verbose=FLAGS.verbose, + skip_confirmation=FLAGS.skip_confirmation, + ), ) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 6de7f350e9..feae900e6d 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -46,7 +46,8 @@ from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, - run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, _run_command) + run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, + _run_command, SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -338,20 +339,23 @@ def _names_from_builder(self, return [step.name for step in pipeline.get_steps()] def test_run_only_step(self) -> None: - cfg_step = PipelineConfig(command=_TEST_COMMAND, - run_only="download-data") + cfg_step = PipelineConfig( + run=RunConfig(command=_TEST_COMMAND, run_only="download-data")) names_step = self._names_from_builder(cfg_step) self.assertEqual(names_step, ["download-data"]) with self.assertRaisesRegex(ValueError, "run_only step not found"): self._names_from_builder( - PipelineConfig(command=_TEST_COMMAND, run_only="nope")) + PipelineConfig( + run=RunConfig(command=_TEST_COMMAND, run_only="nope"))) with self.assertRaisesRegex(ValueError, "run_only step not found"): self._names_from_builder( - PipelineConfig(command=_TEST_COMMAND, run_only="download.nope")) + PipelineConfig(run=RunConfig(command=_TEST_COMMAND, + run_only="download.nope"))) def test_force_semantics(self) -> None: - cfg_all = PipelineConfig(command=_TEST_COMMAND, force=True) + cfg_all = PipelineConfig( + run=RunConfig(command=_TEST_COMMAND, force=True)) names_all = self._names_from_builder(cfg_all) self.assertEqual(names_all, [ "download-data", @@ -373,7 +377,7 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: "process-full-data": (1, "succeeded", older), "create-dc-config": (1, "succeeded", older), }) - cfg = PipelineConfig(command=_TEST_COMMAND) + cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) names = self._names_from_builder(cfg, state=state) self.assertEqual(names, [ "download-metadata", @@ -384,7 +388,7 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: ]) def test_force_branch_records_decisions(self) -> None: - cfg = 
PipelineConfig(command=_TEST_COMMAND, force=True) + cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND, force=True)) steps = build_steps(cfg) builder = PipelineBuilder(config=cfg, state=self._empty_state(), @@ -402,7 +406,8 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: "download-data": (1, "succeeded", newer), "download-metadata": (1, "succeeded", older), }) - cfg = PipelineConfig(command=_TEST_COMMAND, run_only="download-data") + cfg = PipelineConfig( + run=RunConfig(command=_TEST_COMMAND, run_only="download-data")) names = self._names_from_builder(cfg, state=state) self.assertEqual(names, ["download-data"]) @@ -417,7 +422,7 @@ def test_version_bump_schedules_downstream(self) -> None: "process-full-data": (1, "succeeded", 1000), "create-dc-config": (1, "succeeded", 1000), }) - cfg = PipelineConfig(command=_TEST_COMMAND) + cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) names = self._names_from_builder(cfg, steps, state) self.assertEqual(names, ["process-full-data", "create-dc-config"]) @@ -434,7 +439,7 @@ def test_incremental_records_skip_reasons(self) -> None: "process-full-data": (1, "succeeded", 1_000), "create-dc-config": (1, "succeeded", 1_000), }) - cfg = PipelineConfig(command=_TEST_COMMAND) + cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) steps = build_steps(cfg) builder = PipelineBuilder(config=cfg, state=state, steps=steps) result = builder.build() @@ -449,15 +454,23 @@ class RunPipelineTest(unittest.TestCase): def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, command: str) -> PipelineConfig: - return PipelineConfig(endpoint="https://api.example.com", - agency="TEST_AGENCY", - dataflow=dataflow, - dataflow_key="test-key", - dataflow_param="area=US", - dataset_prefix=dataset_prefix, - working_dir=self._tmpdir, - skip_confirmation=True, - command=command) + return PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://api.example.com", + agency="TEST_AGENCY", + dataflow=SdmxDataflowConfig( + id=dataflow, + key="test-key", + param="area=US", + ), + ), + run=RunConfig( + dataset_prefix=dataset_prefix, + working_dir=self._tmpdir, + skip_confirmation=True, + command=command, + ), + ) def setUp(self) -> None: self._tmpdir_obj = tempfile.TemporaryDirectory() @@ -477,6 +490,9 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: clock = _IncrementingClock(datetime(2025, 1, 2, tzinfo=timezone.utc), timedelta(seconds=2)) + # Create dummy input file for sampling + (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") + run_sdmx_pipeline(config=config, now_fn=clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -487,11 +503,11 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: expected_hash = hashlib.sha256( json.dumps( { - "agency": config.agency, - "dataflow": config.dataflow, - "endpoint": config.endpoint, - "dataflow_key": config.dataflow_key, - "dataflow_param": config.dataflow_param, + "sdmx.agency": config.sdmx.agency, + "sdmx.dataflow.id": config.sdmx.dataflow.id, + "sdmx.endpoint": config.sdmx.endpoint, + "sdmx.dataflow.key": config.sdmx.dataflow.key, + "sdmx.dataflow.param": config.sdmx.dataflow.param, }, sort_keys=True, separators=(",", ":")).encode("utf-8")).hexdigest() @@ -513,6 +529,9 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: config = self._build_config(dataset_prefix=None, dataflow=dataflow, command="sdmx run sanitized") + # Create dummy input file for sampling (sanitized name) + (Path(self._tmpdir) / + 
"my_flow_name_2025_data.csv").write_text("header\nrow1") run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( datetime(2025, 1, 3, tzinfo=timezone.utc), @@ -529,9 +548,12 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: def test_invalid_working_dir_raises(self) -> None: path = Path(self._tmpdir) / "not_a_dir" path.write_text("content") - config = dataclasses.replace(self._build_config( - dataset_prefix="demo", dataflow="df", command="sdmx run invalid"), - working_dir=str(path)) + base_config = self._build_config(dataset_prefix="demo", + dataflow="df", + command="sdmx run invalid") + updated_run = dataclasses.replace(base_config.run, + working_dir=str(path)) + config = dataclasses.replace(base_config, run=updated_run) with self.assertRaisesRegex(ValueError, "working_dir is not a directory"): run_sdmx_pipeline(config=config) @@ -542,13 +564,19 @@ def test_hash_change_forces_full_rerun(self) -> None: command="sdmx rerun force") first_clock = _IncrementingClock( datetime(2025, 1, 4, tzinfo=timezone.utc), timedelta(seconds=1)) + # Create dummy input file for sampling + (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") run_sdmx_pipeline(config=config, now_fn=first_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" with state_path.open(encoding="utf-8") as fp: first_state = json.load(fp) - updated_config = dataclasses.replace(config, dataflow_key="changed-key") + updated_dataflow = dataclasses.replace(config.sdmx.dataflow, + key="changed-key") + updated_sdmx = dataclasses.replace(config.sdmx, + dataflow=updated_dataflow) + updated_config = dataclasses.replace(config, sdmx=updated_sdmx) second_clock = _IncrementingClock( datetime(2025, 1, 5, tzinfo=timezone.utc), timedelta(seconds=1)) run_sdmx_pipeline(config=updated_config, now_fn=second_clock) @@ -568,6 +596,8 @@ def test_hash_unchanged_skips_rerun(self) -> None: command="sdmx rerun noop") initial_clock = _IncrementingClock( datetime(2025, 1, 6, tzinfo=timezone.utc), timedelta(seconds=1)) + # Create dummy input file for sampling + (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") run_sdmx_pipeline(config=config, now_fn=initial_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -603,13 +633,19 @@ def test_run_command_logs_and_executes(self) -> None: for entry in logs.output)) def test_download_metadata_step_caches_plan(self) -> None: - config = PipelineConfig(command="test", - endpoint="https://example.com", - agency="AGENCY", - dataflow="FLOW", - dataset_prefix="demo", - working_dir=self._tmpdir, - verbose=True) + config = PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://example.com", + agency="AGENCY", + dataflow=SdmxDataflowConfig(id="FLOW"), + ), + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + ) step = DownloadMetadataStep(name="test-step", config=config) # First call creates plan @@ -622,13 +658,19 @@ def test_download_metadata_step_caches_plan(self) -> None: self.assertIs(plan1, plan2) def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: - config = PipelineConfig(command="test", - endpoint="https://example.com", - agency="AGENCY", - dataflow="FLOW", - dataset_prefix="demo", - working_dir=self._tmpdir, - verbose=True) + config = PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://example.com", + agency="AGENCY", + dataflow=SdmxDataflowConfig(id="FLOW"), + ), + run=RunConfig( + command="test", + dataset_prefix="demo", + 
working_dir=self._tmpdir, + verbose=True, + ), + ) step = DownloadMetadataStep(name="test-step", config=config) with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" @@ -652,15 +694,23 @@ def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: self.assertTrue(kwargs["verbose"]) def test_download_data_step_caches_plan(self) -> None: - config = PipelineConfig(command="test", - endpoint="https://example.com", - agency="AGENCY", - dataflow="FLOW", - dataflow_key="test-key", - dataflow_param="area=US", - dataset_prefix="demo", - working_dir=self._tmpdir, - verbose=True) + config = PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://example.com", + agency="AGENCY", + dataflow=SdmxDataflowConfig( + id="FLOW", + key="test-key", + param="area=US", + ), + ), + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + ) step = DownloadDataStep(name="test-step", config=config) # First call creates plan @@ -675,13 +725,19 @@ def test_download_data_step_caches_plan(self) -> None: self.assertIs(plan1, plan2) def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: - config = PipelineConfig(command="test", - endpoint="https://example.com", - agency="AGENCY", - dataflow="FLOW", - dataset_prefix="demo", - working_dir=self._tmpdir, - verbose=True) + config = PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://example.com", + agency="AGENCY", + dataflow=SdmxDataflowConfig(id="FLOW"), + ), + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + ) step = DownloadDataStep(name="test-step", config=config) with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" @@ -704,6 +760,63 @@ def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("download-data", args[0]) self.assertTrue(kwargs["verbose"]) + def test_create_sample_step_caches_plan(self) -> None: + config = PipelineConfig( + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + sample=SampleConfig(rows=500), + ) + step = CreateSampleStep(name="test-step", config=config) + + # First call creates plan + plan1 = step._prepare_command() + self.assertIn("data_sampler.py", plan1.full_command[1]) + self.assertIn("--sampler_output_rows=500", plan1.full_command) + + # Second call returns same object + plan2 = step._prepare_command() + self.assertIs(plan1, plan2) + + def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig( + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + sample=SampleConfig(rows=500), + ) + step = CreateSampleStep(name="test-step", config=config) + + # Create dummy input file + input_path = Path(self._tmpdir) / "demo_data.csv" + input_path.write_text("header\nrow1") + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("data_sampler.py" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("data_sampler.py", args[0][1]) + 
self.assertTrue(kwargs["verbose"]) + if __name__ == "__main__": unittest.main() From fc7c2e4e4100b57a5bf1540dc5ebae9056a70407 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 17:33:04 +0000 Subject: [PATCH 23/54] feat: Add early input file existence check to `CreateSampleStep` and refactor command preparation using `_StepContext` dataclass, with corresponding test updates. --- tools/agentic_import/sdmx_import_pipeline.py | 49 +++++++++---------- .../sdmx_import_pipeline_test.py | 21 ++++++++ 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 654e2b1278..2ffe77df8b 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -416,54 +416,53 @@ class CreateSampleStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + input_path: Path + full_command: list[str] + output_path: Path + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) - self._plan: CommandPlan | None = None + self._context: CreateSampleStep._StepContext | None = None - def _prepare_command(self) -> CommandPlan: - if self._plan: - return self._plan + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context dataset_prefix = self._config.run.dataset_prefix working_dir = Path(self._config.run.working_dir) input_path = working_dir / f"{dataset_prefix}_data.csv" output_path = working_dir / f"{dataset_prefix}_sample.csv" - # Check input file existence before running, but allow plan creation. - # In a real run, this will fail early if download-data didn't run. + if not input_path.is_file(): + raise RuntimeError(f"Input file missing for sampling: {input_path}") + args = [ f"--sampler_input={input_path}", f"--sampler_output={output_path}", f"--sampler_output_rows={self._config.sample.rows}", ] full_command = [sys.executable, str(DATA_SAMPLER_PATH)] + args - self._plan = CommandPlan(full_command=full_command, - output_path=output_path) - return self._plan + self._context = CreateSampleStep._StepContext(input_path=input_path, + full_command=full_command, + output_path=output_path) + return self._context def run(self) -> None: - plan = self._prepare_command() - # Find input path from command args - input_path_arg = next((arg for arg in plan.full_command - if arg.startswith("--sampler_input=")), None) - if not input_path_arg: - raise RuntimeError("Could not find sampler_input in command") - input_path = Path(input_path_arg.split("=")[1]) - - if not input_path.is_file(): - raise RuntimeError(f"Input file missing for sampling: {input_path}") - + context = self._prepare_command() if self._config.run.verbose: logging.info( - f"Starting data sampling: {' '.join(plan.full_command)} -> {plan.output_path}" + f"Starting data sampling: {' '.join(context.full_command)} -> {context.output_path}" ) else: - logging.info(f"Sampling data to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.run.verbose) + logging.info(f"Sampling data to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() logging.info( - f"{self.name} (dry run): would run {' '.join(plan.full_command)}") + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class CreateSchemaMapStep(SdmxStep): diff --git 
a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index feae900e6d..a1be47e6c2 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -772,6 +772,10 @@ def test_create_sample_step_caches_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) + # Create dummy input file to satisfy validation + input_path = Path(self._tmpdir) / "demo_data.csv" + input_path.write_text("header\nrow1") + # First call creates plan plan1 = step._prepare_command() self.assertIn("data_sampler.py", plan1.full_command[1]) @@ -817,6 +821,23 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("data_sampler.py", args[0][1]) self.assertTrue(kwargs["verbose"]) + def test_create_sample_step_dry_run_fails_if_input_missing(self) -> None: + config = PipelineConfig( + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + sample=SampleConfig(rows=500), + ) + step = CreateSampleStep(name="test-step", config=config) + # No input file created + + with self.assertRaisesRegex(RuntimeError, + "Input file missing for sampling"): + step.dry_run() + if __name__ == "__main__": unittest.main() From e7c8db537de8e00c950e822da649d8f1a6debc53 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 17:37:45 +0000 Subject: [PATCH 24/54] refactor: Replace `CommandPlan` dataclass with a nested `_StepContext` and update all related references in pipeline steps and tests. --- tools/agentic_import/sdmx_import_pipeline.py | 71 ++++++++++--------- .../sdmx_import_pipeline_test.py | 40 +++++------ 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 2ffe77df8b..8abac73aaa 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -61,13 +61,6 @@ def _require_config_field(value: str | None, field: str, step_name: str) -> str: raise ValueError(f"{step_name} requires config.{field}") -@dataclass(frozen=True) -class CommandPlan: - """Holds a constructed command and its expected output path.""" - full_command: list[str] - output_path: Path - - def _run_command(command: Sequence[str], *, verbose: bool) -> None: if verbose: logging.debug(f"Running command: {' '.join(command)}") @@ -310,13 +303,18 @@ class DownloadDataStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + full_command: list[str] + output_path: Path + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) - self._plan: CommandPlan | None = None + self._context: DownloadDataStep._StepContext | None = None - def _prepare_command(self) -> CommandPlan: - if self._plan: - return self._plan + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context endpoint = _require_config_field(self._config.sdmx.endpoint, _FLAG_SDMX_ENDPOINT, self.name) agency = _require_config_field(self._config.sdmx.agency, @@ -340,24 +338,25 @@ def _prepare_command(self) -> CommandPlan: if self._config.run.verbose: args.append("--verbose") full_command = [sys.executable, str(SDMX_CLI_PATH)] + args - self._plan = CommandPlan(full_command=full_command, - output_path=output_path) - return self._plan + self._context = DownloadDataStep._StepContext(full_command=full_command, + output_path=output_path) + 
return self._context def run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() if self._config.run.verbose: logging.info( - f"Starting SDMX data download: {' '.join(plan.full_command)} -> {plan.output_path}" + f"Starting SDMX data download: {' '.join(context.full_command)} -> {context.output_path}" ) else: - logging.info(f"Downloading SDMX data to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.run.verbose) + logging.info(f"Downloading SDMX data to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() logging.info( - f"{self.name} (dry run): would run {' '.join(plan.full_command)}") + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class DownloadMetadataStep(SdmxStep): @@ -365,13 +364,18 @@ class DownloadMetadataStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + full_command: list[str] + output_path: Path + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) - self._plan: CommandPlan | None = None + self._context: DownloadMetadataStep._StepContext | None = None - def _prepare_command(self) -> CommandPlan: - if self._plan: - return self._plan + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context endpoint = _require_config_field(self._config.sdmx.endpoint, _FLAG_SDMX_ENDPOINT, self.name) agency = _require_config_field(self._config.sdmx.agency, @@ -391,24 +395,25 @@ def _prepare_command(self) -> CommandPlan: if self._config.run.verbose: args.append("--verbose") full_command = [sys.executable, str(SDMX_CLI_PATH)] + args - self._plan = CommandPlan(full_command=full_command, - output_path=output_path) - return self._plan + self._context = DownloadMetadataStep._StepContext( + full_command=full_command, output_path=output_path) + return self._context def run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() if self._config.run.verbose: logging.info( - f"Starting SDMX metadata download: {' '.join(plan.full_command)} -> {plan.output_path}" + f"Starting SDMX metadata download: {' '.join(context.full_command)} -> {context.output_path}" ) else: - logging.info(f"Downloading SDMX metadata to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.run.verbose) + logging.info(f"Downloading SDMX metadata to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() logging.info( - f"{self.name} (dry run): would run {' '.join(plan.full_command)}") + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class CreateSampleStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index a1be47e6c2..a8bf7fd265 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -648,14 +648,14 @@ def test_download_metadata_step_caches_plan(self) -> None: ) step = DownloadMetadataStep(name="test-step", config=config) - # First call creates plan - plan1 = step._prepare_command() - self.assertIn("download-metadata", plan1.full_command) - self.assertIn("--endpoint=https://example.com", plan1.full_command) + # First call creates 
context + context1 = step._prepare_command() + self.assertIn("download-metadata", context1.full_command) + self.assertIn("--endpoint=https://example.com", context1.full_command) # Second call returns same object - plan2 = step._prepare_command() - self.assertIs(plan1, plan2) + context2 = step._prepare_command() + self.assertIs(context1, context2) def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -713,16 +713,16 @@ def test_download_data_step_caches_plan(self) -> None: ) step = DownloadDataStep(name="test-step", config=config) - # First call creates plan - plan1 = step._prepare_command() - self.assertIn("download-data", plan1.full_command) - self.assertIn("--endpoint=https://example.com", plan1.full_command) - self.assertIn("--key=test-key", plan1.full_command) - self.assertIn("--param=area=US", plan1.full_command) + # First call creates context + context1 = step._prepare_command() + self.assertIn("download-data", context1.full_command) + self.assertIn("--endpoint=https://example.com", context1.full_command) + self.assertIn("--key=test-key", context1.full_command) + self.assertIn("--param=area=US", context1.full_command) # Second call returns same object - plan2 = step._prepare_command() - self.assertIs(plan1, plan2) + context2 = step._prepare_command() + self.assertIs(context1, context2) def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -776,14 +776,14 @@ def test_create_sample_step_caches_plan(self) -> None: input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") - # First call creates plan - plan1 = step._prepare_command() - self.assertIn("data_sampler.py", plan1.full_command[1]) - self.assertIn("--sampler_output_rows=500", plan1.full_command) + # First call creates context + context1 = step._prepare_command() + self.assertIn("data_sampler.py", context1.full_command[1]) + self.assertIn("--sampler_output_rows=500", context1.full_command) # Second call returns same object - plan2 = step._prepare_command() - self.assertIs(plan1, plan2) + context2 = step._prepare_command() + self.assertIs(context1, context2) def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( From 9a7e38d720d119b3808fb442ed289c39f0638492 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 17:49:11 +0000 Subject: [PATCH 25/54] refactor: move `dry_run` from the general `Step` interface to `SdmxStep` and update related tests and callback logic. 
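
For context, a minimal sketch of the resulting split, using simplified signatures (the real classes also carry name/version metadata and a config object):

    import abc


    class Step(abc.ABC):
        """Generic pipeline step: run() is the only shared contract now."""

        @abc.abstractmethod
        def run(self) -> None:
            ...


    class SdmxStep(Step, abc.ABC):
        """SDMX steps additionally expose a read-only preview."""

        @abc.abstractmethod
        def dry_run(self) -> None:
            ...


    def before_step(step: Step) -> None:
        # Only SDMX steps know how to preview themselves, so callbacks that
        # want a dry run must guard the call with an isinstance check.
        if isinstance(step, SdmxStep):
            step.dry_run()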
--- tools/agentic_import/pipeline.py | 4 ---- tools/agentic_import/pipeline_test.py | 6 ------ tools/agentic_import/sdmx_import_pipeline.py | 10 +++++++--- .../sdmx_import_pipeline_test.py | 19 ++++++++++++------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index bf4890b9de..09b22a94dd 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -44,10 +44,6 @@ def version(self) -> int: def run(self) -> None: """Execute the step.""" - @abc.abstractmethod - def dry_run(self) -> None: - """Log a read-only preview of the work to be done.""" - class BaseStep(Step, abc.ABC): """Helper base class that stores mandatory metadata.""" diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index ee19777ef3..52944546f1 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -39,9 +39,6 @@ def run(self) -> None: self.executed = True self._events.append(f"run:{self.name}") - def dry_run(self) -> None: - return None - class _FailingStep(BaseStep): @@ -51,9 +48,6 @@ def __init__(self, *, name: str, version: int) -> None: def run(self) -> None: raise ValueError("boom") - def dry_run(self) -> None: - return None - class PipelineRunnerTest(unittest.TestCase): diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 8abac73aaa..5fd8e02f79 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,6 +15,7 @@ from __future__ import annotations +import abc import copy import hashlib import json @@ -119,8 +120,9 @@ class InteractiveCallback(PipelineCallback): """Prompts the user before each step runs.""" def before_step(self, step: Step) -> None: - logging.info(f"Dry run for {step.name} (v{step.version}):") - step.dry_run() + if isinstance(step, SdmxStep): + logging.info(f"Dry run for {step.name} (v{step.version}):") + step.dry_run() prompt = f"Run step {step.name} (v{step.version})? [Y/n] " response = input(prompt).strip().lower() if response in ("n", "no"): @@ -295,7 +297,9 @@ def name(self) -> str: def version(self) -> int: return self._version - # Subclasses must implement run() and dry_run(). 
+ @abc.abstractmethod + def dry_run(self) -> None: + """Log a read-only preview of the work to be done.""" class DownloadDataStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index a8bf7fd265..6431dd0224 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -47,10 +47,13 @@ InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - _run_command, SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig) + _run_command, SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig, + SdmxStep) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) +_DUMMY_CONFIG = PipelineConfig(run=RunConfig(command="test")) + class _IncrementingClock: @@ -67,10 +70,10 @@ def __call__(self) -> datetime: return self._value -class _RecordingStep(BaseStep): +class _RecordingStep(SdmxStep): def __init__(self, name: str, *, should_fail: bool = False) -> None: - super().__init__(name=name, version=1) + super().__init__(name=name, version=1, config=_DUMMY_CONFIG) self._should_fail = should_fail def run(self) -> None: @@ -81,10 +84,10 @@ def dry_run(self) -> None: logging.info("noop") -class _VersionedStep(BaseStep): +class _VersionedStep(SdmxStep): def __init__(self, name: str, version: int) -> None: - super().__init__(name=name, version=version) + super().__init__(name=name, version=version, config=_DUMMY_CONFIG) def run(self) -> None: logging.info("noop") @@ -196,10 +199,12 @@ def test_abort_skips_state_persistence(self) -> None: json.dump(previous, fp) callback, handler = self._build_callback(tmpdir=tmpdir, clock=clock) - class _AbortStep(BaseStep): + class _AbortStep(SdmxStep): def __init__(self) -> None: - super().__init__(name="download.download-data", version=1) + super().__init__(name="download.download-data", + version=1, + config=_DUMMY_CONFIG) def run(self) -> None: raise PipelineAbort("user requested stop") From 8fed16a33245a5b886338ea0b24ed31b61213d99 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 06:59:03 +0000 Subject: [PATCH 26/54] feat: Enable CreateSchemaMapStep to execute pvmap_generator.py and add gemini_cli flag. 
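
As an illustration of the resulting behaviour, a usage sketch that mirrors the new unit tests; the working directory, dataset prefix, and file contents below are placeholder values:

    import tempfile
    from pathlib import Path

    from tools.agentic_import.sdmx_import_pipeline import (CreateSchemaMapStep,
                                                            PipelineConfig,
                                                            RunConfig)

    # The step refuses to build its command unless the sample and metadata
    # files produced by the earlier steps already exist.
    work = Path(tempfile.mkdtemp())
    (work / "demo_sample.csv").write_text("header\nrow1")
    (work / "demo_metadata.xml").write_text("<metadata/>")

    config = PipelineConfig(run=RunConfig(command="demo",
                                          dataset_prefix="demo",
                                          working_dir=str(work),
                                          gemini_cli="gemini"))
    step = CreateSchemaMapStep(name="create-schema-map", config=config)
    step.dry_run()  # Logs the pvmap_generator.py invocation without running it.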
--- tools/agentic_import/sdmx_import_pipeline.py | 56 +++++++++++- .../sdmx_import_pipeline_test.py | 90 ++++++++++++++++++- 2 files changed, 142 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 5fd8e02f79..f7c7bf5850 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -38,6 +38,7 @@ SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" +PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" # Flag names _FLAG_SDMX_ENDPOINT = "sdmx.endpoint" @@ -109,6 +110,9 @@ def _define_flags() -> None: flags.DEFINE_boolean("skip_confirmation", False, "Skip interactive confirmation prompts.") + flags.DEFINE_string("gemini_cli", "gemini", + "Path to Gemini CLI executable.") + def _format_time(value: datetime) -> str: if value.tzinfo is None: @@ -248,6 +252,7 @@ class RunConfig: force: bool = False verbose: bool = False skip_confirmation: bool = False + gemini_cli: str | None = None @dataclass(frozen=True) @@ -479,16 +484,62 @@ class CreateSchemaMapStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + sample_path: Path + metadata_path: Path + output_prefix: Path + full_command: list[str] + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateSchemaMapStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) + sample_path = working_dir / f"{dataset_prefix}_sample.csv" + metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" + output_prefix = working_dir / dataset_prefix + + if not sample_path.is_file(): + raise RuntimeError(f"Sample file missing: {sample_path}") + if not metadata_path.is_file(): + raise RuntimeError(f"Metadata file missing: {metadata_path}") + + args = [ + f"--input_data={sample_path}", + f"--input_metadata={metadata_path}", + "--sdmx_dataset", + f"--output_path={output_prefix}", + ] + if self._config.run.skip_confirmation: + args.append("--skip_confirmation") + if self._config.run.gemini_cli: + args.append(f"--gemini_cli={self._config.run.gemini_cli}") + + full_command = [sys.executable, str(PVMAP_GENERATOR_PATH)] + args + self._context = CreateSchemaMapStep._StepContext( + sample_path=sample_path, + metadata_path=metadata_path, + output_prefix=output_prefix, + full_command=full_command) + return self._context def run(self) -> None: + context = self._prepare_command() logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" + ) + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: + context = self._prepare_command() logging.info( - f"{self.name} (dry run): previewing schema mapping outputs") + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class ProcessFullDataStep(SdmxStep): @@ -817,6 +868,7 @@ def prepare_config() -> PipelineConfig: force=FLAGS.force, verbose=FLAGS.verbose, skip_confirmation=FLAGS.skip_confirmation, + gemini_cli=FLAGS.gemini_cli, ), ) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py 
b/tools/agentic_import/sdmx_import_pipeline_test.py index 6431dd0224..37314e7eac 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -47,8 +47,8 @@ InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - _run_command, SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig, - SdmxStep) + CreateSchemaMapStep, _run_command, SdmxConfig, SampleConfig, RunConfig, + SdmxDataflowConfig, SdmxStep) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -497,6 +497,9 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: # Create dummy input file for sampling (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") + # Create dummy sample and metadata files for schema mapping + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") run_sdmx_pipeline(config=config, now_fn=clock) @@ -537,6 +540,11 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: # Create dummy input file for sampling (sanitized name) (Path(self._tmpdir) / "my_flow_name_2025_data.csv").write_text("header\nrow1") + # Create dummy sample and metadata files for schema mapping + (Path(self._tmpdir) / + "my_flow_name_2025_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / + "my_flow_name_2025_metadata.xml").write_text("") run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( datetime(2025, 1, 3, tzinfo=timezone.utc), @@ -571,6 +579,9 @@ def test_hash_change_forces_full_rerun(self) -> None: datetime(2025, 1, 4, tzinfo=timezone.utc), timedelta(seconds=1)) # Create dummy input file for sampling (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") + # Create dummy sample and metadata files for schema mapping + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") run_sdmx_pipeline(config=config, now_fn=first_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -603,6 +614,9 @@ def test_hash_unchanged_skips_rerun(self) -> None: datetime(2025, 1, 6, tzinfo=timezone.utc), timedelta(seconds=1)) # Create dummy input file for sampling (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") + # Create dummy sample and metadata files for schema mapping + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") run_sdmx_pipeline(config=config, now_fn=initial_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -843,6 +857,78 @@ def test_create_sample_step_dry_run_fails_if_input_missing(self) -> None: "Input file missing for sampling"): step.dry_run() + def test_create_schema_map_step_caches_plan(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + gemini_cli="custom-gemini", + skip_confirmation=True, + ),) + step = CreateSchemaMapStep(name="test-step", config=config) + + # Create dummy input files to satisfy validation + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + + # First call creates context + context1 = step._prepare_command() 
+ self.assertIn("pvmap_generator.py", context1.full_command[1]) + self.assertIn("--gemini_cli=custom-gemini", context1.full_command) + self.assertIn("--skip_confirmation", context1.full_command) + + # Second call returns same object + context2 = step._prepare_command() + self.assertIs(context1, context2) + + def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = CreateSchemaMapStep(name="test-step", config=config) + + # Create dummy input files + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("pvmap_generator.py" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("pvmap_generator.py", args[0][1]) + self.assertTrue(kwargs["verbose"]) + + def test_create_schema_map_step_dry_run_fails_if_input_missing( + self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = CreateSchemaMapStep(name="test-step", config=config) + # No input files created + + with self.assertRaisesRegex(RuntimeError, "Sample file missing"): + step.dry_run() + if __name__ == "__main__": unittest.main() From 26800adae8455f08cc0404b477000f8d851ff93a Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:04:45 +0000 Subject: [PATCH 27/54] feat: implement `ProcessFullDataStep` using `stat_var_processor` and introduce distinct sample and final output directories. 
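
As a usage sketch (mirroring the _create_test_input_files helper added to the tests; all paths and file contents are placeholders):

    import tempfile
    from pathlib import Path

    from tools.agentic_import.sdmx_import_pipeline import (PipelineConfig,
                                                            ProcessFullDataStep,
                                                            RunConfig)

    # The step needs the full data file plus the pvmap/metadata emitted by the
    # schema-map step under sample_output/; its own outputs go under output/.
    work = Path(tempfile.mkdtemp())
    (work / "demo_data.csv").write_text("data")
    sample_out = work / "sample_output"
    sample_out.mkdir()
    (sample_out / "demo_pvmap.csv").write_text("pvmap")
    (sample_out / "demo_metadata.csv").write_text("metadata")

    config = PipelineConfig(run=RunConfig(command="demo",
                                          dataset_prefix="demo",
                                          working_dir=str(work)))
    step = ProcessFullDataStep(name="process-full-data", config=config)
    step.dry_run()  # Logs the stat_var_processor.py command; run() writes output/demo*.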
--- tools/agentic_import/sdmx_import_pipeline.py | 70 ++++++++- .../sdmx_import_pipeline_test.py | 140 ++++++++++++++---- 2 files changed, 175 insertions(+), 35 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index f7c7bf5850..972ca419e8 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -38,8 +38,13 @@ SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" +STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / + "stat_var_processor.py") PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" +SAMPLE_OUTPUT_DIR = Path("sample_output") +FINAL_OUTPUT_DIR = Path("output") + # Flag names _FLAG_SDMX_ENDPOINT = "sdmx.endpoint" _FLAG_SDMX_AGENCY = "sdmx.agency" @@ -502,7 +507,7 @@ def _prepare_command(self) -> _StepContext: working_dir = Path(self._config.run.working_dir) sample_path = working_dir / f"{dataset_prefix}_sample.csv" metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" - output_prefix = working_dir / dataset_prefix + output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix if not sample_path.is_file(): raise RuntimeError(f"Sample file missing: {sample_path}") @@ -530,6 +535,7 @@ def _prepare_command(self) -> _StepContext: def run(self) -> None: context = self._prepare_command() + context.output_prefix.parent.mkdir(parents=True, exist_ok=True) logging.info( f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" ) @@ -547,15 +553,73 @@ class ProcessFullDataStep(SdmxStep): VERSION = 1 + RUN_OUTPUT_COLUMNS: ClassVar[str] = ( + "observationDate,observationAbout,variableMeasured,value," + "observationPeriod,measurementMethod,unit,scalingFactor") + + @dataclass(frozen=True) + class _StepContext: + input_data_path: Path + pv_map_path: Path + metadata_path: Path + full_command: list[str] + output_prefix: Path + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._context: ProcessFullDataStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) + input_data_path = working_dir / f"{dataset_prefix}_data.csv" + pv_map_path = (working_dir / SAMPLE_OUTPUT_DIR / + f"{dataset_prefix}_pvmap.csv") + metadata_path = (working_dir / SAMPLE_OUTPUT_DIR / + f"{dataset_prefix}_metadata.csv") + output_prefix = working_dir / FINAL_OUTPUT_DIR / dataset_prefix + + for required in (input_data_path, pv_map_path, metadata_path): + if not required.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {required}") + + args = [ + f"--input_data={input_data_path}", + f"--pv_map={pv_map_path}", + f"--config_file={metadata_path}", + "--generate_statvar_name=True", + "--skip_constant_csv_columns=False", + f"--output_columns={self.RUN_OUTPUT_COLUMNS}", + f"--output_path={output_prefix}", + ] + full_command = [sys.executable, str(STAT_VAR_PROCESSOR_PATH)] + args + self._context = ProcessFullDataStep._StepContext( + input_data_path=input_data_path, + pv_map_path=pv_map_path, + metadata_path=metadata_path, + full_command=full_command, + output_prefix=output_prefix, + ) + return self._context def run(self) -> None: + context = 
self._prepare_command() + # Ensure output directory exists + context.output_prefix.parent.mkdir(parents=True, exist_ok=True) logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + f"Starting stat_var_processor: input={context.input_data_path} " + f"pvmap={context.pv_map_path} metadata={context.metadata_path} -> " + f"{context.output_prefix}") + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - logging.info(f"{self.name} (dry run): previewing full-data processing") + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class CreateDcConfigStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 37314e7eac..5a449c4c78 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -47,8 +47,8 @@ InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - CreateSchemaMapStep, _run_command, SdmxConfig, SampleConfig, RunConfig, - SdmxDataflowConfig, SdmxStep) + CreateSchemaMapStep, ProcessFullDataStep, _run_command, SdmxConfig, + SampleConfig, RunConfig, SdmxDataflowConfig, SdmxStep) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -487,19 +487,26 @@ def setUp(self) -> None: self._mock_run_command = self._run_command_patcher.start() self.addCleanup(self._run_command_patcher.stop) + def _create_test_input_files(self, prefix: str) -> None: + (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") + (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") + (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") + + sample_output_dir = Path(self._tmpdir) / "sample_output" + sample_output_dir.mkdir(parents=True, exist_ok=True) + (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") + (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + def test_run_pipeline_updates_state_and_hash(self) -> None: command = "sdmx run pipeline" config = self._build_config(dataset_prefix="demo", dataflow="df.1", command=command) - clock = _IncrementingClock(datetime(2025, 1, 2, tzinfo=timezone.utc), - timedelta(seconds=2)) + clock = _IncrementingClock(datetime(2025, 1, 1, tzinfo=timezone.utc), + timedelta(seconds=1)) - # Create dummy input file for sampling - (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") - # Create dummy sample and metadata files for schema mapping - (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + # Create dummy files for ProcessFullDataStep + self._create_test_input_files("demo") run_sdmx_pipeline(config=config, now_fn=clock) @@ -537,17 +544,14 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: config = self._build_config(dataset_prefix=None, dataflow=dataflow, command="sdmx run sanitized") - # Create dummy input file for sampling (sanitized name) - (Path(self._tmpdir) / - "my_flow_name_2025_data.csv").write_text("header\nrow1") - # Create dummy sample and metadata files for schema mapping - (Path(self._tmpdir) / - "my_flow_name_2025_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / - 
"my_flow_name_2025_metadata.xml").write_text("") + + # Create test files for ProcessFullDataStep with sanitized name + sanitized_prefix = "my_flow_name_2025" + self._create_test_input_files(sanitized_prefix) + run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( - datetime(2025, 1, 3, tzinfo=timezone.utc), + datetime(2025, 1, 2, tzinfo=timezone.utc), timedelta(seconds=2))) expected_run_id = "my_flow_name_2025" @@ -576,12 +580,12 @@ def test_hash_change_forces_full_rerun(self) -> None: dataflow="df.2", command="sdmx rerun force") first_clock = _IncrementingClock( - datetime(2025, 1, 4, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy input file for sampling - (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") - # Create dummy sample and metadata files for schema mapping - (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + datetime(2025, 1, 2, tzinfo=timezone.utc), timedelta(seconds=1)) + + # Create dummy files for ProcessFullDataStep + self._create_test_input_files("demo") + + # Run 1 with original config run_sdmx_pipeline(config=config, now_fn=first_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -594,7 +598,7 @@ def test_hash_change_forces_full_rerun(self) -> None: dataflow=updated_dataflow) updated_config = dataclasses.replace(config, sdmx=updated_sdmx) second_clock = _IncrementingClock( - datetime(2025, 1, 5, tzinfo=timezone.utc), timedelta(seconds=1)) + datetime(2025, 1, 3, tzinfo=timezone.utc), timedelta(seconds=1)) run_sdmx_pipeline(config=updated_config, now_fn=second_clock) with state_path.open(encoding="utf-8") as fp: @@ -611,12 +615,12 @@ def test_hash_unchanged_skips_rerun(self) -> None: dataflow="df.3", command="sdmx rerun noop") initial_clock = _IncrementingClock( - datetime(2025, 1, 6, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy input file for sampling - (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") - # Create dummy sample and metadata files for schema mapping - (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + datetime(2025, 1, 3, tzinfo=timezone.utc), timedelta(seconds=1)) + + # Create dummy files for ProcessFullDataStep + self._create_test_input_files("demo") + + # Run 1 run_sdmx_pipeline(config=config, now_fn=initial_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -640,6 +644,16 @@ def setUp(self) -> None: self.addCleanup(self._tmpdir_obj.cleanup) self._tmpdir = self._tmpdir_obj.name + def _create_test_input_files(self, prefix: str) -> None: + (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") + (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") + (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") + + sample_output_dir = Path(self._tmpdir) / "sample_output" + sample_output_dir.mkdir(parents=True, exist_ok=True) + (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") + (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -925,8 +939,70 @@ def test_create_schema_map_step_dry_run_fails_if_input_missing( ),) step = CreateSchemaMapStep(name="test-step", config=config) # No input files created + with self.assertRaises(RuntimeError): + step.dry_run() + + def 
test_process_full_data_step_caches_plan(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = ProcessFullDataStep(name="test-step", config=config) - with self.assertRaisesRegex(RuntimeError, "Sample file missing"): + # Create test files to satisfy validation + self._create_test_input_files("demo") + + context1 = step._prepare_command() + context2 = step._prepare_command() + self.assertIs(context1, context2) + + def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = ProcessFullDataStep(name="test-step", config=config) + + # Create test files + self._create_test_input_files("demo") + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("stat_var_processor.py" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("stat_var_processor.py", args[0][1]) + self.assertIn("--input_data=", args[0][2]) + self.assertTrue(kwargs["verbose"]) + + def test_process_full_data_step_run_fails_if_input_missing(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = ProcessFullDataStep(name="test-step", config=config) + # Missing input files + with self.assertRaises(RuntimeError): + step.run() + with self.assertRaises(RuntimeError): step.dry_run() From 55360a9eb7bc1514c8fddfbfaef54c259e8fa1b6 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:21:36 +0000 Subject: [PATCH 28/54] feat: Implement `CreateDcConfigStep` to generate custom DC configurations. 
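
The step plans a single generate_custom_dc_config.py invocation from the resolved
pipeline configuration; dry_run() only logs the planned command, while run() checks
that the processed CSV exists and then shells out via _run_command(). A minimal usage
sketch mirroring the new unit tests (the step name "create-dc-config", the working
directory "/tmp/work", and the SDMX identifiers are illustrative values, not defaults):

    from tools.agentic_import.sdmx_import_pipeline import (
        CreateDcConfigStep, PipelineConfig, RunConfig, SdmxConfig,
        SdmxDataflowConfig)

    config = PipelineConfig(
        sdmx=SdmxConfig(endpoint="https://example.com",
                        agency="AGENCY",
                        dataflow=SdmxDataflowConfig(id="FLOW")),
        run=RunConfig(command="test",
                      dataset_prefix="demo",
                      working_dir="/tmp/work"))
    step = CreateDcConfigStep(name="create-dc-config", config=config)
    step.dry_run()  # logs the generate_custom_dc_config.py command it would run
    step.run()      # requires /tmp/work/output/demo.csv; executes via _run_command()
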
--- tools/agentic_import/sdmx_import_pipeline.py | 57 +++++++++++- .../sdmx_import_pipeline_test.py | 87 ++++++++++++++++++- 2 files changed, 140 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 972ca419e8..0b4b18f64a 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -41,6 +41,8 @@ STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / "stat_var_processor.py") PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" +DC_CONFIG_GENERATOR_PATH = (REPO_ROOT / "tools" / "agentic_import" / + "generate_custom_dc_config.py") SAMPLE_OUTPUT_DIR = Path("sample_output") FINAL_OUTPUT_DIR = Path("output") @@ -627,15 +629,66 @@ class CreateDcConfigStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + input_csv: Path + output_config: Path + full_command: list[str] + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateDcConfigStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) + input_csv = working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}.csv" + output_config = (working_dir / FINAL_OUTPUT_DIR / + f"{dataset_prefix}_config.json") + + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + + dataset_url = (f"{endpoint.rstrip('/')}/data/" + f"{agency},{dataflow},") + + args = [ + f"--input_csv={input_csv}", + f"--output_config={output_config}", + f"--provenance_name={dataflow}", + f"--source_name={agency}", + f"--data_source_url={endpoint}", + f"--dataset_url={dataset_url}", + ] + full_command = [sys.executable, str(DC_CONFIG_GENERATOR_PATH)] + args + self._context = CreateDcConfigStep._StepContext( + input_csv=input_csv, + output_config=output_config, + full_command=full_command) + return self._context def run(self) -> None: + context = self._prepare_command() + if not context.input_csv.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {context.input_csv}") + logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + f"Starting custom DC config generation: input={context.input_csv} -> {context.output_config}" + ) + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - logging.info(f"{self.name} (dry run): previewing DC config creation") + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class PipelineBuilder: diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 5a449c4c78..d3f14d0ced 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -47,8 +47,8 @@ InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - 
CreateSchemaMapStep, ProcessFullDataStep, _run_command, SdmxConfig, - SampleConfig, RunConfig, SdmxDataflowConfig, SdmxStep) + CreateSchemaMapStep, ProcessFullDataStep, CreateDcConfigStep, _run_command, + SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig, SdmxStep) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -507,6 +507,10 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: # Create dummy files for ProcessFullDataStep self._create_test_input_files("demo") + # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "demo.csv").write_text("data") run_sdmx_pipeline(config=config, now_fn=clock) @@ -548,6 +552,10 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: # Create test files for ProcessFullDataStep with sanitized name sanitized_prefix = "my_flow_name_2025" self._create_test_input_files(sanitized_prefix) + # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / f"{sanitized_prefix}.csv").write_text("data") run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( @@ -584,6 +592,10 @@ def test_hash_change_forces_full_rerun(self) -> None: # Create dummy files for ProcessFullDataStep self._create_test_input_files("demo") + # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "demo.csv").write_text("data") # Run 1 with original config run_sdmx_pipeline(config=config, now_fn=first_clock) @@ -619,6 +631,10 @@ def test_hash_unchanged_skips_rerun(self) -> None: # Create dummy files for ProcessFullDataStep self._create_test_input_files("demo") + # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "demo.csv").write_text("data") # Run 1 run_sdmx_pipeline(config=config, now_fn=initial_clock) @@ -654,6 +670,29 @@ def _create_test_input_files(self, prefix: str) -> None: (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + def _build_config(self, + dataset_prefix: str | None, + endpoint: str = "https://example.com", + agency: str = "AGENCY", + dataflow: str = "FLOW") -> PipelineConfig: + return PipelineConfig(sdmx=SdmxConfig( + endpoint=endpoint, + agency=agency, + dataflow=SdmxDataflowConfig(id=dataflow)), + run=RunConfig(command="test", + dataset_prefix=dataset_prefix, + working_dir=self._tmpdir)) + + def _create_test_input_files(self, prefix: str) -> None: + (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") + (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") + (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") + + sample_output_dir = Path(self._tmpdir) / "sample_output" + sample_output_dir.mkdir(parents=True, exist_ok=True) + (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") + (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -1005,6 +1044,50 @@ def 
test_process_full_data_step_run_fails_if_input_missing(self) -> None: with self.assertRaises(RuntimeError): step.dry_run() + def test_create_dc_config_step_caches_plan(self) -> None: + config = self._build_config(dataset_prefix="demo", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW") + step = CreateDcConfigStep(name="test-step", config=config) + context1 = step._prepare_command() + context2 = step._prepare_command() + self.assertIs(context1, context2) + + def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: + config = self._build_config(dataset_prefix="demo", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW") + step = CreateDcConfigStep(name="test-step", config=config) + + # Create test files + self._create_test_input_files("demo") + # Create final output dir and input csv + final_output_dir = Path(self._tmpdir) / "output" + final_output_dir.mkdir(parents=True, exist_ok=True) + (final_output_dir / "demo.csv").write_text("data") + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + step.run() + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + command = args[0] + self.assertIn("generate_custom_dc_config.py", command[1]) + self.assertIn(f"--input_csv={final_output_dir}/demo.csv", command) + self.assertIn( + f"--output_config={final_output_dir}/demo_config.json", command) + self.assertIn("--provenance_name=FLOW", command) + self.assertIn("--source_name=AGENCY", command) + self.assertIn("--data_source_url=https://example.com", command) + self.assertIn("--dataset_url=https://example.com/data/AGENCY,FLOW,", + command) + + with self.assertLogs(logging.get_absl_logger(), level="INFO") as cm: + step.dry_run() + self.assertTrue(any("would run" in msg for msg in cm.output)) + if __name__ == "__main__": unittest.main() From 1926be1dc6961565d49a8711afe3117104a9a0c4 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:39:22 +0000 Subject: [PATCH 29/54] refactor: Rename dummy config to `_TEST_CONFIG` and centralize dummy output file creation in tests. 
--- .../sdmx_import_pipeline_test.py | 56 +++++++------------ 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index d3f14d0ced..2c497197d6 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -52,7 +52,7 @@ from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) -_DUMMY_CONFIG = PipelineConfig(run=RunConfig(command="test")) +_TEST_CONFIG = PipelineConfig(run=RunConfig(command="test")) class _IncrementingClock: @@ -73,7 +73,7 @@ def __call__(self) -> datetime: class _RecordingStep(SdmxStep): def __init__(self, name: str, *, should_fail: bool = False) -> None: - super().__init__(name=name, version=1, config=_DUMMY_CONFIG) + super().__init__(name=name, version=1, config=_TEST_CONFIG) self._should_fail = should_fail def run(self) -> None: @@ -87,7 +87,7 @@ def dry_run(self) -> None: class _VersionedStep(SdmxStep): def __init__(self, name: str, version: int) -> None: - super().__init__(name=name, version=version, config=_DUMMY_CONFIG) + super().__init__(name=name, version=version, config=_TEST_CONFIG) def run(self) -> None: logging.info("noop") @@ -204,7 +204,7 @@ class _AbortStep(SdmxStep): def __init__(self) -> None: super().__init__(name="download.download-data", version=1, - config=_DUMMY_CONFIG) + config=_TEST_CONFIG) def run(self) -> None: raise PipelineAbort("user requested stop") @@ -497,6 +497,10 @@ def _create_test_input_files(self, prefix: str) -> None: (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / f"{prefix}.csv").write_text("output") + def test_run_pipeline_updates_state_and_hash(self) -> None: command = "sdmx run pipeline" config = self._build_config(dataset_prefix="demo", @@ -505,12 +509,8 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: clock = _IncrementingClock(datetime(2025, 1, 1, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy files for ProcessFullDataStep + # Create test files for ProcessFullDataStep self._create_test_input_files("demo") - # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / "demo.csv").write_text("data") run_sdmx_pipeline(config=config, now_fn=clock) @@ -552,10 +552,6 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: # Create test files for ProcessFullDataStep with sanitized name sanitized_prefix = "my_flow_name_2025" self._create_test_input_files(sanitized_prefix) - # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / f"{sanitized_prefix}.csv").write_text("data") run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( @@ -590,12 +586,8 @@ def test_hash_change_forces_full_rerun(self) -> None: first_clock = _IncrementingClock( datetime(2025, 1, 2, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy files for ProcessFullDataStep + # Create test files for ProcessFullDataStep self._create_test_input_files("demo") - # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep - 
output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / "demo.csv").write_text("data") # Run 1 with original config run_sdmx_pipeline(config=config, now_fn=first_clock) @@ -629,12 +621,8 @@ def test_hash_unchanged_skips_rerun(self) -> None: initial_clock = _IncrementingClock( datetime(2025, 1, 3, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy files for ProcessFullDataStep + # Create test files for ProcessFullDataStep self._create_test_input_files("demo") - # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / "demo.csv").write_text("data") # Run 1 run_sdmx_pipeline(config=config, now_fn=initial_clock) @@ -670,6 +658,10 @@ def _create_test_input_files(self, prefix: str) -> None: (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / f"{prefix}.csv").write_text("output") + def _build_config(self, dataset_prefix: str | None, endpoint: str = "https://example.com", @@ -683,16 +675,6 @@ def _build_config(self, dataset_prefix=dataset_prefix, working_dir=self._tmpdir)) - def _create_test_input_files(self, prefix: str) -> None: - (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") - (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") - (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") - - sample_output_dir = Path(self._tmpdir) / "sample_output" - sample_output_dir.mkdir(parents=True, exist_ok=True) - (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") - (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") - def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -844,7 +826,7 @@ def test_create_sample_step_caches_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) - # Create dummy input file to satisfy validation + # Create test input file to satisfy validation input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") @@ -869,7 +851,7 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) - # Create dummy input file + # Create test input file input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") @@ -921,7 +903,7 @@ def test_create_schema_map_step_caches_plan(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) - # Create dummy input files to satisfy validation + # Create test input files to satisfy validation (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") (Path(self._tmpdir) / "demo_metadata.xml").write_text("") @@ -944,7 +926,7 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) - # Create dummy input files + # Create test input files (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") (Path(self._tmpdir) / "demo_metadata.xml").write_text("") From 503ea7d662a22779eee01b77ac86ad99926e9ad1 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:45:39 +0000 Subject: [PATCH 30/54] refactor: replace global flags with 
structured dataclass-based configuration objects and related utility functions. --- tools/agentic_import/sdmx_import_pipeline.py | 304 +++++++++---------- 1 file changed, 152 insertions(+), 152 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 0b4b18f64a..a0ee8c2e36 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -64,6 +64,69 @@ FLAGS = flags.FLAGS +@dataclass(frozen=True) +class SdmxDataflowConfig: + """Configuration for SDMX dataflow.""" + id: str | None = None + key: str | None = None + param: str | None = None + + +@dataclass(frozen=True) +class SdmxConfig: + """Configuration for SDMX data access.""" + endpoint: str | None = None + agency: str | None = None + dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) + + +@dataclass(frozen=True) +class SampleConfig: + """Configuration for data sampling.""" + rows: int = 1000 + + +@dataclass(frozen=True) +class RunConfig: + """Configuration for pipeline execution.""" + command: str + dataset_prefix: str | None = None + working_dir: str | None = None + run_only: str | None = None + force: bool = False + verbose: bool = False + skip_confirmation: bool = False + gemini_cli: str | None = None + + +@dataclass(frozen=True) +class PipelineConfig: + """Aggregated configuration for the pipeline.""" + sdmx: SdmxConfig = field(default_factory=SdmxConfig) + sample: SampleConfig = field(default_factory=SampleConfig) + run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) + + +@dataclass(frozen=True) +class StepDecision: + """Represents whether a step will run and why.""" + + RUN: ClassVar[str] = "RUN" + SKIP: ClassVar[str] = "SKIP" + + step_name: str + decision: str + reason: str + + +@dataclass(frozen=True) +class BuildResult: + """Output of planning that includes the pipeline and per-step decisions.""" + + pipeline: Pipeline + decisions: list[StepDecision] + + def _require_config_field(value: str | None, field: str, step_name: str) -> str: if value: return value @@ -81,50 +144,62 @@ def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: _run_command(command, verbose=verbose) -def _define_flags() -> None: - flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") - flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) - - flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, - "Owning SDMX agency identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) - - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, - "Target SDMX dataflow identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) - - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, - "Optional SDMX key or filter.") +def _format_time(value: datetime) -> str: + if value.tzinfo is None: + value = value.replace(tzinfo=timezone.utc) + return value.isoformat() - flags.DEFINE_string( - _FLAG_SDMX_DATAFLOW_PARAM, None, - "Optional SDMX parameter appended to the dataflow query.") - flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, - "Number of rows to sample from downloaded data.") +def _sanitize_run_id(dataflow: str) -> str: + normalized = dataflow.lower() + normalized = re.sub(r"[^a-z0-9_]+", "_", normalized) + normalized = re.sub(r"_+", "_", normalized) + return normalized.strip("_") - flags.DEFINE_string( - "dataset_prefix", None, - "Optional dataset prefix to override auto-derived values.") - flags.DEFINE_string("run_only", None, - "Execute only a specific pipeline step by name.") +def 
_resolve_dataset_prefix(config: PipelineConfig) -> str: + if config.run.dataset_prefix: + return config.run.dataset_prefix + if not config.sdmx.dataflow.id: + raise ValueError( + "dataflow.id or dataset_prefix is required to derive dataset prefix" + ) + sanitized = _sanitize_run_id(config.sdmx.dataflow.id) + if not sanitized: + raise ValueError("dataflow value is invalid after sanitization") + return sanitized - flags.DEFINE_boolean("force", False, "Force all steps to run.") - flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") +def _compute_critical_input_hash(config: PipelineConfig) -> str: + payload = { + _FLAG_SDMX_AGENCY: config.sdmx.agency, + _FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, + _FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, + _FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, + _FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, + } + serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(serialized.encode("utf-8")).hexdigest() - flags.DEFINE_boolean("skip_confirmation", False, - "Skip interactive confirmation prompts.") - flags.DEFINE_string("gemini_cli", "gemini", - "Path to Gemini CLI executable.") +def _resolve_working_dir(config: PipelineConfig) -> Path: + directory = Path(config.run.working_dir or os.getcwd()) + if directory.exists(): + if not directory.is_dir(): + raise ValueError(f"working_dir is not a directory: {directory}") + else: + directory.mkdir(parents=True, exist_ok=True) + return directory -def _format_time(value: datetime) -> str: - if value.tzinfo is None: - value = value.replace(tzinfo=timezone.utc) - return value.isoformat() +def _resolve_config(config: PipelineConfig) -> PipelineConfig: + """Resolves dynamic configuration values and returns a new config.""" + dataset_prefix = _resolve_dataset_prefix(config) + working_dir = _resolve_working_dir(config) + new_run = dataclasses.replace(config.run, + dataset_prefix=dataset_prefix, + working_dir=str(working_dir)) + return dataclasses.replace(config, run=new_run) class InteractiveCallback(PipelineCallback): @@ -227,69 +302,6 @@ def build_pipeline_callback( return CompositeCallback([interactive, json_callback]) -@dataclass(frozen=True) -class SdmxDataflowConfig: - """Configuration for SDMX dataflow.""" - id: str | None = None - key: str | None = None - param: str | None = None - - -@dataclass(frozen=True) -class SdmxConfig: - """Configuration for SDMX data access.""" - endpoint: str | None = None - agency: str | None = None - dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) - - -@dataclass(frozen=True) -class SampleConfig: - """Configuration for data sampling.""" - rows: int = 1000 - - -@dataclass(frozen=True) -class RunConfig: - """Configuration for pipeline execution.""" - command: str - dataset_prefix: str | None = None - working_dir: str | None = None - run_only: str | None = None - force: bool = False - verbose: bool = False - skip_confirmation: bool = False - gemini_cli: str | None = None - - -@dataclass(frozen=True) -class PipelineConfig: - """Aggregated configuration for the pipeline.""" - sdmx: SdmxConfig = field(default_factory=SdmxConfig) - sample: SampleConfig = field(default_factory=SampleConfig) - run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) - - -@dataclass(frozen=True) -class StepDecision: - """Represents whether a step will run and why.""" - - RUN: ClassVar[str] = "RUN" - SKIP: ClassVar[str] = "SKIP" - - step_name: str - decision: str - reason: str - - -@dataclass(frozen=True) 
-class BuildResult: - """Output of planning that includes the pipeline and per-step decisions.""" - - pipeline: Pipeline - decisions: list[StepDecision] - - class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -866,66 +878,14 @@ def build_sdmx_pipeline(*, critical_input_hash: str | None = None) -> Pipeline: builder_steps = steps if steps is not None else build_steps(config) builder = PipelineBuilder(config=config, - state=state, - steps=builder_steps, - critical_input_hash=critical_input_hash) + state=state, + steps=builder_steps, + critical_input_hash=critical_input_hash) result = builder.build() _log_step_decisions(result.decisions) return result.pipeline -def _sanitize_run_id(dataflow: str) -> str: - normalized = dataflow.lower() - normalized = re.sub(r"[^a-z0-9_]+", "_", normalized) - normalized = re.sub(r"_+", "_", normalized) - return normalized.strip("_") - - -def _resolve_dataset_prefix(config: PipelineConfig) -> str: - if config.run.dataset_prefix: - return config.run.dataset_prefix - if not config.sdmx.dataflow.id: - raise ValueError( - "dataflow.id or dataset_prefix is required to derive dataset prefix" - ) - sanitized = _sanitize_run_id(config.sdmx.dataflow.id) - if not sanitized: - raise ValueError("dataflow value is invalid after sanitization") - return sanitized - - -def _compute_critical_input_hash(config: PipelineConfig) -> str: - payload = { - _FLAG_SDMX_AGENCY: config.sdmx.agency, - _FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, - _FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, - _FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, - _FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, - } - serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) - return hashlib.sha256(serialized.encode("utf-8")).hexdigest() - - -def _resolve_working_dir(config: PipelineConfig) -> Path: - directory = Path(config.run.working_dir or os.getcwd()) - if directory.exists(): - if not directory.is_dir(): - raise ValueError(f"working_dir is not a directory: {directory}") - else: - directory.mkdir(parents=True, exist_ok=True) - return directory - - -def _resolve_config(config: PipelineConfig) -> PipelineConfig: - """Resolves dynamic configuration values and returns a new config.""" - dataset_prefix = _resolve_dataset_prefix(config) - working_dir = _resolve_working_dir(config) - new_run = dataclasses.replace(config.run, - dataset_prefix=dataset_prefix, - working_dir=str(working_dir)) - return dataclasses.replace(config, run=new_run) - - def run_sdmx_pipeline( *, config: PipelineConfig, @@ -990,6 +950,46 @@ def prepare_config() -> PipelineConfig: ) +def _define_flags() -> None: + flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") + flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) + + flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, + "Owning SDMX agency identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) + + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, + "Target SDMX dataflow identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) + + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, + "Optional SDMX key or filter.") + + flags.DEFINE_string( + _FLAG_SDMX_DATAFLOW_PARAM, None, + "Optional SDMX parameter appended to the dataflow query.") + + flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, + "Number of rows to sample from downloaded data.") + + flags.DEFINE_string( + "dataset_prefix", None, + "Optional dataset prefix to override auto-derived values.") + + flags.DEFINE_string("run_only", None, + 
"Execute only a specific pipeline step by name.") + + flags.DEFINE_boolean("force", False, "Force all steps to run.") + + flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") + + flags.DEFINE_boolean("skip_confirmation", False, + "Skip interactive confirmation prompts.") + + flags.DEFINE_string("gemini_cli", "gemini", + "Path to Gemini CLI executable.") + + def main(_: list[str]) -> int: config = prepare_config() logging.info(f"SDMX pipeline configuration: {config}") From 58f5d517219ca6d00c803188f1665697ac68fddb Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:48:15 +0000 Subject: [PATCH 31/54] feat: Reorder imports and adjust `PipelineBuilder` parameter indentation --- tools/agentic_import/sdmx_import_pipeline.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index a0ee8c2e36..2510a755e1 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -36,6 +36,12 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) +from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, + PipelineAbort, PipelineCallback, + PipelineRunner, RunnerConfig, Step) +from tools.agentic_import.state_handler import (PipelineState, StateHandler, + StepState) + SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / @@ -55,12 +61,6 @@ _FLAG_SDMX_DATAFLOW_PARAM = "sdmx.dataflow.param" _FLAG_SAMPLE_ROWS = "sample.rows" -from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, - PipelineAbort, PipelineCallback, - PipelineRunner, RunnerConfig, Step) -from tools.agentic_import.state_handler import (PipelineState, StateHandler, - StepState) - FLAGS = flags.FLAGS @@ -878,9 +878,9 @@ def build_sdmx_pipeline(*, critical_input_hash: str | None = None) -> Pipeline: builder_steps = steps if steps is not None else build_steps(config) builder = PipelineBuilder(config=config, - state=state, - steps=builder_steps, - critical_input_hash=critical_input_hash) + state=state, + steps=builder_steps, + critical_input_hash=critical_input_hash) result = builder.build() _log_step_decisions(result.decisions) return result.pipeline From 6cc6172e3a84e58fedcfed7a23f33f612639e0e0 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 10:25:47 +0000 Subject: [PATCH 32/54] feat: Add SDMX agentic import pipeline with new documentation, code, and tests. --- tools/agentic_import/sdmx_import_pipeline.md | 89 ++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 tools/agentic_import/sdmx_import_pipeline.md diff --git a/tools/agentic_import/sdmx_import_pipeline.md b/tools/agentic_import/sdmx_import_pipeline.md new file mode 100644 index 0000000000..050f2e60aa --- /dev/null +++ b/tools/agentic_import/sdmx_import_pipeline.md @@ -0,0 +1,89 @@ +# SDMX Agentic Import Pipeline + +The SDMX Agentic Import Pipeline is a Python-based system designed to automate the retrieval and processing of SDMX (Statistical Data and Metadata eXchange) data for Data Commons. It provides a structured, step-based approach to downloading, sampling, mapping, and processing SDMX data into Data Commons artifacts. + +## Overview + +The pipeline orchestrates several tools to handle the end-to-end import process: +1. 
**Download**: Retrieves data and metadata from SDMX endpoints. +2. **Sample**: Creates a manageable sample of the data for analysis. +3. **Map**: Generates Property-Value (PV) mappings using LLM-based tools. +4. **Process**: Converts the full dataset into Data Commons MCF and CSV formats. +5. **Config**: Generates configuration for custom Data Commons instances. + +## Prerequisites + +Before running the pipeline, ensure you have: +1. **Python Environment**: Set up as described in the [main README](./README.md#step-2-environment-setup). +2. **Gemini CLI**: Installed and configured for schema mapping. +3. **Data Commons API Key**: Set in your environment. + +## Usage + +The pipeline is executed using the `sdmx_import_pipeline.py` script. + +### Basic Command + +```bash +python tools/agentic_import/sdmx_import_pipeline.py \ + --sdmx.endpoint="https://sdmx.example.org/data" \ + --sdmx.agency="AGENCY_ID" \ + --sdmx.dataflow.id="DATAFLOW_ID" \ + --working_dir="/path/to/working/dir" +``` + +### Key Flags + +- `--sdmx.endpoint`: The SDMX API endpoint URL. +- `--sdmx.agency`: The SDMX agency ID. +- `--sdmx.dataflow.id`: The SDMX dataflow ID. +- `--sdmx.dataflow.key`: (Optional) Filter key for data download. +- `--sdmx.dataflow.param`: (Optional) Additional parameters for data download. +- `--working_dir`: Directory for input and output files. +- `--sample.rows`: Number of rows for the sample dataset (default: 1000). +- `--force`: Force re-execution of all steps, ignoring saved state. +- `--verbose`: Enable verbose logging. + +## Pipeline Steps + +The pipeline consists of the following steps, executed in order: + +1. **DownloadDataStep**: Downloads SDMX data to `_data.csv`. +2. **DownloadMetadataStep**: Downloads SDMX metadata to `_metadata.xml`. +3. **CreateSampleStep**: Creates `_sample.csv` from the downloaded data. +4. **CreateSchemaMapStep**: Generates PV map and config in `sample_output/` using `pvmap_generator.py`. +5. **ProcessFullDataStep**: Processes the full data using `stat_var_processor.py` to generate artifacts in `output/`. +6. **CreateDcConfigStep**: Generates `output/_config.json` for custom DC imports. + +## Directory Structure + +The pipeline organizes outputs within the specified `--working_dir`: + +``` +working_dir/ +├── _data.csv # Raw downloaded data +├── _metadata.xml # Raw downloaded metadata +├── _sample.csv # Sampled data +├── .state.json # Pipeline state for resuming runs +├── sample_output/ # Intermediate artifacts from mapping +│ ├── _pvmap.csv +│ └── _metadata.csv +└── output/ # Final Data Commons artifacts + ├── .csv + ├── .mcf + ├── .tmcf + └── _config.json +``` + +## State Management + +The pipeline automatically saves its state to a `.state.json` file in the working directory. +- **Resuming**: If a run is interrupted, running the same command again will resume from the last successful step. +- **Skipping**: Steps that have already completed successfully will be skipped unless `--force` is used. +- **Input Hashing**: The pipeline tracks input configuration. If critical configuration changes, it may trigger re-execution of steps. + +## Troubleshooting + +- **Gemini CLI Errors**: If the schema mapping step fails, check the Gemini CLI logs (usually in `.datacommons/runs/` within the working directory). +- **Missing Data**: Ensure the SDMX endpoint, agency, and dataflow ID are correct. Use `--verbose` to see the exact commands being run. 
+- **State Issues**: If the pipeline is stuck or behaving unexpectedly, you can delete `.state.json` to reset the state, or use `--force`. From 8e4e7a7db8f60accee9d6e227b01a8985ef02bc6 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 10:42:28 +0000 Subject: [PATCH 33/54] fix: Move input file existence checks to `run` methods, allowing dry runs to succeed without pre-existing files and updating related tests. --- tools/agentic_import/sdmx_import_pipeline.py | 26 +++---- .../sdmx_import_pipeline_test.py | 76 ++++++++++++++----- 2 files changed, 68 insertions(+), 34 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 2510a755e1..1a6b81dd71 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -467,9 +467,6 @@ def _prepare_command(self) -> _StepContext: input_path = working_dir / f"{dataset_prefix}_data.csv" output_path = working_dir / f"{dataset_prefix}_sample.csv" - if not input_path.is_file(): - raise RuntimeError(f"Input file missing for sampling: {input_path}") - args = [ f"--sampler_input={input_path}", f"--sampler_output={output_path}", @@ -483,6 +480,9 @@ def _prepare_command(self) -> _StepContext: def run(self) -> None: context = self._prepare_command() + if not context.input_path.is_file(): + raise RuntimeError( + f"Input file missing for sampling: {context.input_path}") if self._config.run.verbose: logging.info( f"Starting data sampling: {' '.join(context.full_command)} -> {context.output_path}" @@ -523,11 +523,6 @@ def _prepare_command(self) -> _StepContext: metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix - if not sample_path.is_file(): - raise RuntimeError(f"Sample file missing: {sample_path}") - if not metadata_path.is_file(): - raise RuntimeError(f"Metadata file missing: {metadata_path}") - args = [ f"--input_data={sample_path}", f"--input_metadata={metadata_path}", @@ -549,6 +544,11 @@ def _prepare_command(self) -> _StepContext: def run(self) -> None: context = self._prepare_command() + if not context.sample_path.is_file(): + raise RuntimeError(f"Sample file missing: {context.sample_path}") + if not context.metadata_path.is_file(): + raise RuntimeError( + f"Metadata file missing: {context.metadata_path}") context.output_prefix.parent.mkdir(parents=True, exist_ok=True) logging.info( f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" @@ -595,11 +595,6 @@ def _prepare_command(self) -> _StepContext: f"{dataset_prefix}_metadata.csv") output_prefix = working_dir / FINAL_OUTPUT_DIR / dataset_prefix - for required in (input_data_path, pv_map_path, metadata_path): - if not required.is_file(): - raise RuntimeError( - f"{self.name} requires existing input: {required}") - args = [ f"--input_data={input_data_path}", f"--pv_map={pv_map_path}", @@ -621,6 +616,11 @@ def _prepare_command(self) -> _StepContext: def run(self) -> None: context = self._prepare_command() + for required in (context.input_data_path, context.pv_map_path, + context.metadata_path): + if not required.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {required}") # Ensure output directory exists context.output_prefix.parent.mkdir(parents=True, exist_ok=True) logging.info( diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 2c497197d6..ba4dbeb6a8 100644 --- 
a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -826,11 +826,7 @@ def test_create_sample_step_caches_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) - # Create test input file to satisfy validation - input_path = Path(self._tmpdir) / "demo_data.csv" - input_path.write_text("header\nrow1") - - # First call creates context + # No input file created, dry run should still succeed context1 = step._prepare_command() self.assertIn("data_sampler.py", context1.full_command[1]) self.assertIn("--sampler_output_rows=500", context1.full_command) @@ -851,7 +847,7 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) - # Create test input file + # Create test input file for run() input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") @@ -875,7 +871,9 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("data_sampler.py", args[0][1]) self.assertTrue(kwargs["verbose"]) - def test_create_sample_step_dry_run_fails_if_input_missing(self) -> None: + self.assertTrue(kwargs["verbose"]) + + def test_create_sample_step_dry_run_succeeds_if_input_missing(self) -> None: config = PipelineConfig( run=RunConfig( command="test", @@ -886,11 +884,24 @@ def test_create_sample_step_dry_run_fails_if_input_missing(self) -> None: sample=SampleConfig(rows=500), ) step = CreateSampleStep(name="test-step", config=config) - # No input file created + # No input file created, dry run should still succeed + step.dry_run() + def test_create_sample_step_run_fails_if_input_missing(self) -> None: + config = PipelineConfig( + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + sample=SampleConfig(rows=500), + ) + step = CreateSampleStep(name="test-step", config=config) + # No input file created, run should fail with self.assertRaisesRegex(RuntimeError, "Input file missing for sampling"): - step.dry_run() + step.run() def test_create_schema_map_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -903,9 +914,7 @@ def test_create_schema_map_step_caches_plan(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) - # Create test input files to satisfy validation - (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + # No input files created, dry run should still succeed # First call creates context context1 = step._prepare_command() @@ -926,7 +935,7 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) - # Create test input files + # Create test input files for run() (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") (Path(self._tmpdir) / "demo_metadata.xml").write_text("") @@ -950,7 +959,9 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("pvmap_generator.py", args[0][1]) self.assertTrue(kwargs["verbose"]) - def test_create_schema_map_step_dry_run_fails_if_input_missing( + self.assertTrue(kwargs["verbose"]) + + def test_create_schema_map_step_dry_run_succeeds_if_input_missing( self) -> None: config = PipelineConfig(run=RunConfig( command="test", @@ -959,9 +970,20 @@ def test_create_schema_map_step_dry_run_fails_if_input_missing( verbose=True, ),) step = 
CreateSchemaMapStep(name="test-step", config=config) - # No input files created + # No input files created, dry run should still succeed + step.dry_run() + + def test_create_schema_map_step_run_fails_if_input_missing(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = CreateSchemaMapStep(name="test-step", config=config) + # No input files created, run should fail with self.assertRaises(RuntimeError): - step.dry_run() + step.run() def test_process_full_data_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -972,8 +994,7 @@ def test_process_full_data_step_caches_plan(self) -> None: ),) step = ProcessFullDataStep(name="test-step", config=config) - # Create test files to satisfy validation - self._create_test_input_files("demo") + # No input files created, dry run should still succeed context1 = step._prepare_command() context2 = step._prepare_command() @@ -1012,6 +1033,21 @@ def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("--input_data=", args[0][2]) self.assertTrue(kwargs["verbose"]) + self.assertIn("--input_data=", args[0][2]) + self.assertTrue(kwargs["verbose"]) + + def test_process_full_data_step_dry_run_succeeds_if_input_missing( + self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = ProcessFullDataStep(name="test-step", config=config) + # Missing input files, dry run should still succeed + step.dry_run() + def test_process_full_data_step_run_fails_if_input_missing(self) -> None: config = PipelineConfig(run=RunConfig( command="test", @@ -1020,11 +1056,9 @@ def test_process_full_data_step_run_fails_if_input_missing(self) -> None: verbose=True, ),) step = ProcessFullDataStep(name="test-step", config=config) - # Missing input files + # Missing input files, run should fail with self.assertRaises(RuntimeError): step.run() - with self.assertRaises(RuntimeError): - step.dry_run() def test_create_dc_config_step_caches_plan(self) -> None: config = self._build_config(dataset_prefix="demo", From 7b4bb8d16652798d9310cb6e33ab45b50c46e57f Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 03:28:22 +0000 Subject: [PATCH 34/54] feat: Introduce `working_dir` flag and encapsulate flag definitions to prevent `Duplicate --- tools/agentic_import/pvmap_generator.py | 86 ++++++++++++------- tools/agentic_import/pvmap_generator_test.py | 48 +++++++++++ tools/agentic_import/sdmx_import_pipeline.py | 23 +++-- .../sdmx_import_pipeline_test.py | 12 +++ 4 files changed, 126 insertions(+), 43 deletions(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index 2bf0217d98..110a8e68a8 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -33,45 +33,60 @@ _FLAGS = flags.FLAGS _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -flags.DEFINE_list('input_data', None, - 'List of input data file paths (required)') -flags.mark_flag_as_required('input_data') -# TODO: Allow users to provide original source path and auto-generate sample data files internally -flags.DEFINE_list('input_metadata', [], - 'List of input metadata file paths (optional)') +def _define_flags(): + try: + flags.DEFINE_list('input_data', None, + 'List of input data file paths (required)') + flags.mark_flag_as_required('input_data') -flags.DEFINE_boolean('sdmx_dataset', 
False, - 'Whether the dataset is in SDMX format (default: False)') + flags.DEFINE_list('input_metadata', [], + 'List of input metadata file paths (optional)') -flags.DEFINE_boolean('dry_run', False, - 'Generate prompt only without calling Gemini CLI') + flags.DEFINE_boolean( + 'sdmx_dataset', False, + 'Whether the dataset is in SDMX format (default: False)') -flags.DEFINE_string('maps_api_key', None, 'Google Maps API key (optional)') + flags.DEFINE_boolean('dry_run', False, + 'Generate prompt only without calling Gemini CLI') -flags.DEFINE_string('dc_api_key', None, 'Data Commons API key (optional)') + flags.DEFINE_string('maps_api_key', None, + 'Google Maps API key (optional)') -flags.DEFINE_integer('max_iterations', 10, - 'Maximum number of attempts for statvar processor.') + flags.DEFINE_string('dc_api_key', None, + 'Data Commons API key (optional)') -flags.DEFINE_boolean( - 'skip_confirmation', False, - 'Skip user confirmation before starting PV map generation') + flags.DEFINE_integer( + 'max_iterations', 10, + 'Maximum number of attempts for statvar processor.') -flags.DEFINE_boolean( - 'enable_sandboxing', - platform.system() == 'Darwin', - 'Enable sandboxing for Gemini CLI (default: True on macOS, False elsewhere)' -) + flags.DEFINE_boolean( + 'skip_confirmation', False, + 'Skip user confirmation before starting PV map generation') -flags.DEFINE_string( - 'output_path', 'output/output', - 'Output path prefix for all generated files (default: output/output)') + flags.DEFINE_boolean( + 'enable_sandboxing', + platform.system() == 'Darwin', + 'Enable sandboxing for Gemini CLI (default: True on macOS, False elsewhere)' + ) + + flags.DEFINE_string( + 'output_path', 'output/output', + 'Output path prefix for all generated files (default: output/output)' + ) -flags.DEFINE_string( - 'gemini_cli', 'gemini', 'Custom path or command to invoke Gemini CLI. ' - 'Example: "/usr/local/bin/gemini". ' - 'WARNING: This value is executed in a shell - use only with trusted input.') + flags.DEFINE_string( + 'gemini_cli', 'gemini', + 'Custom path or command to invoke Gemini CLI. ' + 'Example: "/usr/local/bin/gemini". ' + 'WARNING: This value is executed in a shell - use only with trusted input.' 
+ ) + + flags.DEFINE_string( + 'working_dir', None, + 'Working directory for the generator (default: current directory)') + except flags.DuplicateFlagError: + pass @dataclass @@ -93,6 +108,7 @@ class Config: enable_sandboxing: bool = False output_path: str = 'output/output' gemini_cli: Optional[str] = None + working_dir: Optional[str] = None @dataclass @@ -110,7 +126,12 @@ class PVMapGenerator: def __init__(self, config: Config): # Define working directory once for consistent path resolution - self._working_dir = Path.cwd() + self._working_dir = Path( + config.working_dir).resolve() if config.working_dir else Path.cwd() + if self._working_dir.exists() and not self._working_dir.is_dir(): + raise ValueError( + f"working_dir is not a directory: {self._working_dir}") + self._working_dir.mkdir(parents=True, exist_ok=True) # Copy config to avoid modifying the original self._config = copy.deepcopy(config) @@ -314,6 +335,7 @@ def _run_subprocess(self, command: str) -> int: stdout=subprocess.PIPE, stderr=subprocess.STDOUT, # Combine stderr with stdout shell=True, # Using shell to support pipe operations + cwd=self._working_dir, # Run in the specified working directory encoding='utf-8', errors='replace', bufsize=1, # Line buffered @@ -406,7 +428,8 @@ def prepare_config() -> Config: skip_confirmation=_FLAGS.skip_confirmation, enable_sandboxing=_FLAGS.enable_sandboxing, output_path=_FLAGS.output_path, - gemini_cli=_FLAGS.gemini_cli) + gemini_cli=_FLAGS.gemini_cli, + working_dir=_FLAGS.working_dir) def main(_): @@ -424,4 +447,5 @@ def main(_): if __name__ == '__main__': + _define_flags() app.run(main) diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index 32ad242787..293ecafa47 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -178,6 +178,54 @@ def test_rejects_paths_outside_working_directory(self): input_data=[str(external_file)], input_metadata=[]), dry_run=True)) + def test_generate_prompt_with_relative_working_dir(self): + # Create a subdirectory for the relative working directory test + sub_dir_name = 'sub_working_dir' + sub_dir = Path(self._temp_dir.name) / sub_dir_name + sub_dir.mkdir() + + # Create input files inside the subdirectory + data_file = sub_dir / 'input.csv' + data_file.write_text('header\nvalue') + metadata_file = sub_dir / 'metadata.csv' + metadata_file.write_text('parameter,value') + + # Use relative path for working_dir + config = Config( + data_config=DataConfig( + input_data=[ + str(data_file.relative_to(Path(self._temp_dir.name))) + ], # Relative to PWD + input_metadata=[ + str(metadata_file.relative_to(Path(self._temp_dir.name))) + ], # Relative to PWD + is_sdmx_dataset=False, + ), + dry_run=True, + max_iterations=3, + output_path='output/output_file', + working_dir=sub_dir_name, # Relative path + ) + + # We need to run from the parent directory so the relative path is valid + # The setUp already changed to self._temp_dir.name, so we are in the right place + + generator = PVMapGenerator(config) + result = generator.generate() + + self._assert_generation_result(result) + prompt_path = self._read_prompt_path(result) + prompt_text = prompt_path.read_text() + + # Verify that the working directory in the prompt is the absolute path of the subdirectory + expected_working_dir = str(sub_dir.resolve()) + self.assertIn(expected_working_dir, prompt_text) + self.assertIn(f'"working_dir": "{expected_working_dir}"', prompt_text) + + # Verify input paths are also absolute in 
the prompt + self.assertIn(str(data_file.resolve()), prompt_text) + self.assertIn(str(metadata_file.resolve()), prompt_text) + if __name__ == '__main__': unittest.main() diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 1a6b81dd71..d3224e587a 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -183,12 +183,10 @@ def _compute_critical_input_hash(config: PipelineConfig) -> str: def _resolve_working_dir(config: PipelineConfig) -> Path: - directory = Path(config.run.working_dir or os.getcwd()) - if directory.exists(): - if not directory.is_dir(): - raise ValueError(f"working_dir is not a directory: {directory}") - else: - directory.mkdir(parents=True, exist_ok=True) + directory = Path(config.run.working_dir or os.getcwd()).resolve() + if directory.exists() and not directory.is_dir(): + raise ValueError(f"working_dir is not a directory: {directory}") + directory.mkdir(parents=True, exist_ok=True) return directory @@ -350,7 +348,7 @@ def _prepare_command(self) -> _StepContext: dataflow = _require_config_field(self._config.sdmx.dataflow.id, _FLAG_SDMX_DATAFLOW_ID, self.name) dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() output_path = working_dir / f"{dataset_prefix}_data.csv" args = [ "download-data", @@ -411,7 +409,7 @@ def _prepare_command(self) -> _StepContext: dataflow = _require_config_field(self._config.sdmx.dataflow.id, _FLAG_SDMX_DATAFLOW_ID, self.name) dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() output_path = working_dir / f"{dataset_prefix}_metadata.xml" args = [ "download-metadata", @@ -463,7 +461,7 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() input_path = working_dir / f"{dataset_prefix}_data.csv" output_path = working_dir / f"{dataset_prefix}_sample.csv" @@ -518,7 +516,7 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() sample_path = working_dir / f"{dataset_prefix}_sample.csv" metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix @@ -533,6 +531,7 @@ def _prepare_command(self) -> _StepContext: args.append("--skip_confirmation") if self._config.run.gemini_cli: args.append(f"--gemini_cli={self._config.run.gemini_cli}") + args.append(f"--working_dir={working_dir}") full_command = [sys.executable, str(PVMAP_GENERATOR_PATH)] + args self._context = CreateSchemaMapStep._StepContext( @@ -587,7 +586,7 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() input_data_path = working_dir / f"{dataset_prefix}_data.csv" pv_map_path = (working_dir / SAMPLE_OUTPUT_DIR / f"{dataset_prefix}_pvmap.csv") @@ -655,7 +654,7 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context 
dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() input_csv = working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}.csv" output_config = (working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}_config.json") diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index ba4dbeb6a8..7b050d80fd 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -710,6 +710,7 @@ def test_download_metadata_step_caches_plan(self) -> None: # Second call returns same object context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.output_path.is_absolute()) def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -777,6 +778,7 @@ def test_download_data_step_caches_plan(self) -> None: # Second call returns same object context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.output_path.is_absolute()) def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -834,6 +836,7 @@ def test_create_sample_step_caches_plan(self) -> None: # Second call returns same object context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.output_path.is_absolute()) def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -925,6 +928,9 @@ def test_create_schema_map_step_caches_plan(self) -> None: # Second call returns same object context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.sample_path.is_absolute()) + self.assertTrue(context1.metadata_path.is_absolute()) + self.assertTrue(context1.output_prefix.is_absolute()) def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -999,6 +1005,10 @@ def test_process_full_data_step_caches_plan(self) -> None: context1 = step._prepare_command() context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.input_data_path.is_absolute()) + self.assertTrue(context1.pv_map_path.is_absolute()) + self.assertTrue(context1.metadata_path.is_absolute()) + self.assertTrue(context1.output_prefix.is_absolute()) def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -1069,6 +1079,8 @@ def test_create_dc_config_step_caches_plan(self) -> None: context1 = step._prepare_command() context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.input_csv.is_absolute()) + self.assertTrue(context1.output_config.is_absolute()) def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: config = self._build_config(dataset_prefix="demo", From 5fd7e25830d5d29a04f9f3cbdefef294462a2c9f Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 03:57:58 +0000 Subject: [PATCH 35/54] docs: clarify SDMX import pipeline prerequisites, usage, step names, and state management. 
--- tools/agentic_import/sdmx_import_pipeline.md | 41 +++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.md b/tools/agentic_import/sdmx_import_pipeline.md index 050f2e60aa..32f7eae252 100644 --- a/tools/agentic_import/sdmx_import_pipeline.md +++ b/tools/agentic_import/sdmx_import_pipeline.md @@ -7,29 +7,38 @@ The SDMX Agentic Import Pipeline is a Python-based system designed to automate t The pipeline orchestrates several tools to handle the end-to-end import process: 1. **Download**: Retrieves data and metadata from SDMX endpoints. 2. **Sample**: Creates a manageable sample of the data for analysis. -3. **Map**: Generates Property-Value (PV) mappings using LLM-based tools. -4. **Process**: Converts the full dataset into Data Commons MCF and CSV formats. -5. **Config**: Generates configuration for custom Data Commons instances. +3. **Schema Mapping**: Generates Property-Value (PV) mappings using LLM-based tools. +4. **Full Data Processing**: Converts the full dataset into Data Commons MCF and CSV formats. +5. **Custom DC Config**: Generates configuration for custom Data Commons instances. ## Prerequisites -Before running the pipeline, ensure you have: -1. **Python Environment**: Set up as described in the [main README](./README.md#step-2-environment-setup). -2. **Gemini CLI**: Installed and configured for schema mapping. -3. **Data Commons API Key**: Set in your environment. +Before running the pipeline, ensure you have set up your environment as described in the [main README](./README.md#step-2-environment-setup). Key requirements include: + +1. **DC_DATA_REPO_PATH**: Environment variable pointing to your cloned Data Commons data repository. +2. **WORKING_DIR**: Environment variable pointing to your working directory. +3. **Python Environment**: Activated virtual environment with required dependencies. +4. **Gemini CLI**: Installed and configured for schema mapping. +5. **Data Commons API Key**: Set in your environment. ## Usage The pipeline is executed using the `sdmx_import_pipeline.py` script. +**Important:** The command must be run from within your working directory. + ### Basic Command ```bash -python tools/agentic_import/sdmx_import_pipeline.py \ +# Ensure you are in your working directory +cd $WORKING_DIR + +# Run the pipeline using the full path to the script +python $DC_DATA_REPO_PATH/tools/agentic_import/sdmx_import_pipeline.py \ --sdmx.endpoint="https://sdmx.example.org/data" \ --sdmx.agency="AGENCY_ID" \ --sdmx.dataflow.id="DATAFLOW_ID" \ - --working_dir="/path/to/working/dir" + --dataset_prefix="my_dataset" ``` ### Key Flags @@ -39,9 +48,10 @@ python tools/agentic_import/sdmx_import_pipeline.py \ - `--sdmx.dataflow.id`: The SDMX dataflow ID. - `--sdmx.dataflow.key`: (Optional) Filter key for data download. - `--sdmx.dataflow.param`: (Optional) Additional parameters for data download. -- `--working_dir`: Directory for input and output files. +- `--dataset_prefix`: (Optional) Prefix for generated artifacts. Useful for disambiguating multiple datasets in the same working directory. If not provided, it is derived from the dataflow ID. - `--sample.rows`: Number of rows for the sample dataset (default: 1000). - `--force`: Force re-execution of all steps, ignoring saved state. +- `--skip_confirmation`: Skip interactive confirmation prompts during schema mapping. - `--verbose`: Enable verbose logging. 
## Pipeline Steps @@ -57,14 +67,15 @@ The pipeline consists of the following steps, executed in order: ## Directory Structure -The pipeline organizes outputs within the specified `--working_dir`: +The pipeline organizes outputs within the specified working directory: ``` working_dir/ ├── _data.csv # Raw downloaded data ├── _metadata.xml # Raw downloaded metadata ├── _sample.csv # Sampled data -├── .state.json # Pipeline state for resuming runs +├── .datacommons/ +│ └── .state.json # Pipeline state for resuming runs ├── sample_output/ # Intermediate artifacts from mapping │ ├── _pvmap.csv │ └── _metadata.csv @@ -77,13 +88,13 @@ working_dir/ ## State Management -The pipeline automatically saves its state to a `.state.json` file in the working directory. +The pipeline automatically saves its state to a `.state.json` file in the `.datacommons/` directory within your working directory. - **Resuming**: If a run is interrupted, running the same command again will resume from the last successful step. - **Skipping**: Steps that have already completed successfully will be skipped unless `--force` is used. - **Input Hashing**: The pipeline tracks input configuration. If critical configuration changes, it may trigger re-execution of steps. ## Troubleshooting -- **Gemini CLI Errors**: If the schema mapping step fails, check the Gemini CLI logs (usually in `.datacommons/runs/` within the working directory). +- **Gemini CLI Errors**: If the schema mapping step fails, check the Gemini CLI logs (usually in `.datacommons/runs/` within the working directory). Refer to the [main README](./README.md#debugging) for detailed debugging instructions. - **Missing Data**: Ensure the SDMX endpoint, agency, and dataflow ID are correct. Use `--verbose` to see the exact commands being run. -- **State Issues**: If the pipeline is stuck or behaving unexpectedly, you can delete `.state.json` to reset the state, or use `--force`. +- **State Issues**: If the pipeline is stuck or behaving unexpectedly, you can delete the state file to reset the state, or use `--force`. From eff87058f257e713fc06bedbfe0dd2efe511a5c7 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 04:10:41 +0000 Subject: [PATCH 36/54] refactor: extract common SDMX test setup and helper methods into a new `SdmxTestBase` class. 
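Tests that need SDMX fixtures can now subclass the shared base instead of duplicating setUp and config plumbing. Roughly, a new test case in this file reduces to the following sketch (the class name and assertion are illustrative, not part of this patch):

    class DemoStepTest(SdmxTestBase):

        def test_build_config_uses_temp_working_dir(self) -> None:
            # Temp dir creation and cleanup come from SdmxTestBase.setUp.
            self._create_test_input_files("demo")
            config = self._build_config(dataset_prefix="demo")
            self.assertEqual(config.run.working_dir, self._tmpdir)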
--- .../sdmx_import_pipeline_test.py | 110 +++++++----------- 1 file changed, 45 insertions(+), 65 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 7b050d80fd..73bad6e7ba 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -455,14 +455,37 @@ def test_incremental_records_skip_reasons(self) -> None: self.assertIn("up-to-date", decision.reason) -class RunPipelineTest(unittest.TestCase): +class SdmxTestBase(unittest.TestCase): - def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, - command: str) -> PipelineConfig: + def setUp(self) -> None: + self._tmpdir_obj = tempfile.TemporaryDirectory() + self.addCleanup(self._tmpdir_obj.cleanup) + self._tmpdir = self._tmpdir_obj.name + + def _create_test_input_files(self, prefix: str) -> None: + (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") + (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") + (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") + + sample_output_dir = Path(self._tmpdir) / "sample_output" + sample_output_dir.mkdir(parents=True, exist_ok=True) + (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") + (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / f"{prefix}.csv").write_text("output") + + def _build_config(self, + dataset_prefix: str | None, + dataflow: str | None = "FLOW", + command: str = "test", + endpoint: str = "https://example.com", + agency: str = "AGENCY") -> PipelineConfig: return PipelineConfig( sdmx=SdmxConfig( - endpoint="https://api.example.com", - agency="TEST_AGENCY", + endpoint=endpoint, + agency=agency, dataflow=SdmxDataflowConfig( id=dataflow, key="test-key", @@ -477,30 +500,17 @@ def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, ), ) + +class RunPipelineTest(SdmxTestBase): + def setUp(self) -> None: - self._tmpdir_obj = tempfile.TemporaryDirectory() - self.addCleanup(self._tmpdir_obj.cleanup) - self._tmpdir = self._tmpdir_obj.name + super().setUp() # Mock _run_command to avoid actual execution during pipeline tests self._run_command_patcher = mock.patch( "tools.agentic_import.sdmx_import_pipeline._run_command") self._mock_run_command = self._run_command_patcher.start() self.addCleanup(self._run_command_patcher.stop) - def _create_test_input_files(self, prefix: str) -> None: - (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") - (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") - (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") - - sample_output_dir = Path(self._tmpdir) / "sample_output" - sample_output_dir.mkdir(parents=True, exist_ok=True) - (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") - (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") - - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / f"{prefix}.csv").write_text("output") - def test_run_pipeline_updates_state_and_hash(self) -> None: command = "sdmx run pipeline" config = self._build_config(dataset_prefix="demo", @@ -641,39 +651,7 @@ def test_hash_unchanged_skips_rerun(self) -> None: self.assertEqual(first_state, second_state) -class SdmxStepTest(unittest.TestCase): - - def setUp(self) -> None: - self._tmpdir_obj = 
tempfile.TemporaryDirectory() - self.addCleanup(self._tmpdir_obj.cleanup) - self._tmpdir = self._tmpdir_obj.name - - def _create_test_input_files(self, prefix: str) -> None: - (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") - (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") - (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") - - sample_output_dir = Path(self._tmpdir) / "sample_output" - sample_output_dir.mkdir(parents=True, exist_ok=True) - (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") - (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") - - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / f"{prefix}.csv").write_text("output") - - def _build_config(self, - dataset_prefix: str | None, - endpoint: str = "https://example.com", - agency: str = "AGENCY", - dataflow: str = "FLOW") -> PipelineConfig: - return PipelineConfig(sdmx=SdmxConfig( - endpoint=endpoint, - agency=agency, - dataflow=SdmxDataflowConfig(id=dataflow)), - run=RunConfig(command="test", - dataset_prefix=dataset_prefix, - working_dir=self._tmpdir)) +class SdmxStepTest(SdmxTestBase): def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: @@ -830,7 +808,8 @@ def test_create_sample_step_caches_plan(self) -> None: # No input file created, dry run should still succeed context1 = step._prepare_command() - self.assertIn("data_sampler.py", context1.full_command[1]) + self.assertTrue( + any("data_sampler.py" in arg for arg in context1.full_command)) self.assertIn("--sampler_output_rows=500", context1.full_command) # Second call returns same object @@ -871,7 +850,7 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: # Verify run called the command with the same args mock_run_cmd.assert_called_once() args, kwargs = mock_run_cmd.call_args - self.assertIn("data_sampler.py", args[0][1]) + self.assertTrue(any("data_sampler.py" in arg for arg in args[0])) self.assertTrue(kwargs["verbose"]) self.assertTrue(kwargs["verbose"]) @@ -921,7 +900,8 @@ def test_create_schema_map_step_caches_plan(self) -> None: # First call creates context context1 = step._prepare_command() - self.assertIn("pvmap_generator.py", context1.full_command[1]) + self.assertTrue( + any("pvmap_generator.py" in arg for arg in context1.full_command)) self.assertIn("--gemini_cli=custom-gemini", context1.full_command) self.assertIn("--skip_confirmation", context1.full_command) @@ -962,7 +942,7 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: # Verify run called the command with the same args mock_run_cmd.assert_called_once() args, kwargs = mock_run_cmd.call_args - self.assertIn("pvmap_generator.py", args[0][1]) + self.assertTrue(any("pvmap_generator.py" in arg for arg in args[0])) self.assertTrue(kwargs["verbose"]) self.assertTrue(kwargs["verbose"]) @@ -1039,11 +1019,10 @@ def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: # Verify run called the command with the same args mock_run_cmd.assert_called_once() args, kwargs = mock_run_cmd.call_args - self.assertIn("stat_var_processor.py", args[0][1]) - self.assertIn("--input_data=", args[0][2]) - self.assertTrue(kwargs["verbose"]) - - self.assertIn("--input_data=", args[0][2]) + self.assertTrue( + any("stat_var_processor.py" in arg for arg in args[0])) + self.assertTrue( + any(arg.startswith("--input_data=") for arg in args[0])) self.assertTrue(kwargs["verbose"]) def 
test_process_full_data_step_dry_run_succeeds_if_input_missing( @@ -1102,7 +1081,8 @@ def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: mock_run_cmd.assert_called_once() args, kwargs = mock_run_cmd.call_args command = args[0] - self.assertIn("generate_custom_dc_config.py", command[1]) + self.assertTrue( + any("generate_custom_dc_config.py" in arg for arg in command)) self.assertIn(f"--input_csv={final_output_dir}/demo.csv", command) self.assertIn( f"--output_config={final_output_dir}/demo_config.json", command) From 11abd361efaf112b6b5ade660bd4a05f154ab8d0 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 04:55:09 +0000 Subject: [PATCH 37/54] feat: Resolve relative input and output paths against the working directory in PVMapGenerator. --- tools/agentic_import/pvmap_generator.py | 14 +++++--- tools/agentic_import/pvmap_generator_test.py | 38 ++++++++++++++++---- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index 110a8e68a8..60f4d9a955 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -151,13 +151,16 @@ def __init__(self, config: Config): ] # Parse output_path into directory and basename components - output_path = Path(self._config.output_path) + # Parse output_path, handling relative paths and ~ expansion + output_path = Path(self._config.output_path).expanduser() + if not output_path.is_absolute(): + output_path = self._working_dir / output_path + self._output_dir = output_path.parent self._output_basename = output_path.name # Create output directory if it doesn't exist - output_full_dir = self._working_dir / self._output_dir - output_full_dir.mkdir(parents=True, exist_ok=True) + self._output_dir.mkdir(parents=True, exist_ok=True) self._datacommons_dir = self._initialize_datacommons_dir() @@ -171,7 +174,10 @@ def __init__(self, config: Config): def _validate_and_convert_path(self, path: str) -> Path: """Convert path to absolute and validate it's within working directory.""" - real_path = Path(path).expanduser().resolve() + p = Path(path).expanduser() + if not p.is_absolute(): + p = self._working_dir / p + real_path = p.resolve() working_dir = self._working_dir.resolve() try: real_path.relative_to(working_dir) diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index 293ecafa47..0095770065 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -193,12 +193,8 @@ def test_generate_prompt_with_relative_working_dir(self): # Use relative path for working_dir config = Config( data_config=DataConfig( - input_data=[ - str(data_file.relative_to(Path(self._temp_dir.name))) - ], # Relative to PWD - input_metadata=[ - str(metadata_file.relative_to(Path(self._temp_dir.name))) - ], # Relative to PWD + input_data=['input.csv'], # Relative to working_dir + input_metadata=['metadata.csv'], # Relative to working_dir is_sdmx_dataset=False, ), dry_run=True, @@ -226,6 +222,36 @@ def test_generate_prompt_with_relative_working_dir(self): self.assertIn(str(data_file.resolve()), prompt_text) self.assertIn(str(metadata_file.resolve()), prompt_text) + def test_relative_paths_resolved_against_working_dir(self): + # Create a separate working directory + with tempfile.TemporaryDirectory() as work_dir: + work_path = Path(work_dir) + # Create input files inside the working directory + data_file = work_path / 'input.csv' + 
data_file.write_text('header\nvalue') + + # Run from a different directory (current temp dir) + # Use relative path to input file, which should be resolved against work_dir + config = Config( + data_config=DataConfig( + input_data=['input.csv'], # Relative to work_dir + input_metadata=[], + is_sdmx_dataset=False, + ), + dry_run=True, + working_dir=work_dir, + ) + + # This should not raise ValueError because input.csv is found in work_dir + generator = PVMapGenerator(config) + result = generator.generate() + self._assert_generation_result(result) + self.assertEqual(str(generator._config.data_config.input_data[0]), + str(data_file.resolve())) + # Verify output directory is also under working_dir + self.assertTrue( + str(generator._output_dir).startswith(str(work_path.resolve()))) + if __name__ == '__main__': unittest.main() From b08f8ca4a26e2891e1d5d8150bbc563de2cafc61 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 06:15:12 +0000 Subject: [PATCH 38/54] feat: Add working directory flag to SDMX import pipeline and ensure absolute output path in pvmap generator. --- tools/agentic_import/pvmap_generator.py | 1 + tools/agentic_import/pvmap_generator_test.py | 3 +++ tools/agentic_import/sdmx_import_pipeline.py | 5 ++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index 60f4d9a955..cdbac209fe 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -158,6 +158,7 @@ def __init__(self, config: Config): self._output_dir = output_path.parent self._output_basename = output_path.name + self._config.output_path = str(output_path) # Create output directory if it doesn't exist self._output_dir.mkdir(parents=True, exist_ok=True) diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index 0095770065..cad6f360ab 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -116,6 +116,9 @@ def _assert_prompt_content(self, prompt_path: Path, *, expect_sdmx: bool, self.assertIn(f'You have exactly {config.max_iterations} attempts', prompt_text) + # Output path should be absolute in the prompt + self.assertIn(f'--output-path "{config.output_path}"', prompt_text) + if expect_sdmx: # SDMX prompts highlight dataset type and show SDMX-specific banner. self.assertIn('"dataset_type": "sdmx"', prompt_text) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index d3224e587a..3c933f5d5a 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -939,7 +939,7 @@ def prepare_config() -> PipelineConfig: run=RunConfig( command=command, dataset_prefix=FLAGS.dataset_prefix, - working_dir=None, + working_dir=FLAGS.working_dir, run_only=FLAGS.run_only, force=FLAGS.force, verbose=FLAGS.verbose, @@ -988,6 +988,9 @@ def _define_flags() -> None: flags.DEFINE_string("gemini_cli", "gemini", "Path to Gemini CLI executable.") + flags.DEFINE_string("working_dir", None, + "Working directory for the pipeline.") + def main(_: list[str]) -> int: config = prepare_config() From cd9041a403f4b6d42bd0b46d7058210800291fec Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 06:19:27 +0000 Subject: [PATCH 39/54] docs: Consolidate comments for output_path parsing logic. 
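For reference, the consolidated comment documents the resolution rule introduced in the previous commits; a standalone sketch of that behavior follows (the helper name is illustrative — the real logic lives inline in PVMapGenerator.__init__):

    from pathlib import Path

    def resolve_output_path(output_path: str, working_dir: Path) -> Path:
        # Expand ~ first, then anchor relative paths under the working directory.
        path = Path(output_path).expanduser()
        if not path.is_absolute():
            path = working_dir / path
        return path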
--- tools/agentic_import/pvmap_generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index cdbac209fe..0edbd0f88b 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -150,8 +150,7 @@ def __init__(self, config: Config): for path in self._config.data_config.input_metadata ] - # Parse output_path into directory and basename components - # Parse output_path, handling relative paths and ~ expansion + # Parse output_path into directory and basename components, handling relative paths and ~ expansion output_path = Path(self._config.output_path).expanduser() if not output_path.is_absolute(): output_path = self._working_dir / output_path From 650800bf5d85e5797323fb2e40d8576c74b3f5dc Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 06:46:37 +0000 Subject: [PATCH 40/54] fix: enforce absolute generator paths Point prompt vars and tests at output_path_abs and resolved working/script dirs --- tools/agentic_import/pvmap_generator.py | 33 +++++----- tools/agentic_import/pvmap_generator_test.py | 3 +- .../templates/generate_pvmap_prompt.j2 | 66 +++++++++---------- 3 files changed, 52 insertions(+), 50 deletions(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index 0edbd0f88b..eb1cdf5bdb 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -150,17 +150,18 @@ def __init__(self, config: Config): for path in self._config.data_config.input_metadata ] - # Parse output_path into directory and basename components, handling relative paths and ~ expansion + # Parse output_path into absolute path, handling relative paths and ~ expansion output_path = Path(self._config.output_path).expanduser() if not output_path.is_absolute(): output_path = self._working_dir / output_path + self._output_path_abs = output_path.resolve() - self._output_dir = output_path.parent - self._output_basename = output_path.name - self._config.output_path = str(output_path) + self._output_dir_abs = self._output_path_abs.parent + self._output_basename = self._output_path_abs.name + self._config.output_path = str(self._output_path_abs) # Create output directory if it doesn't exist - self._output_dir.mkdir(parents=True, exist_ok=True) + self._output_dir_abs.mkdir(parents=True, exist_ok=True) self._datacommons_dir = self._initialize_datacommons_dir() @@ -211,7 +212,7 @@ def _get_user_confirmation(self, prompt_file: Path) -> bool: print(f"Generated prompt: {prompt_file}") print(f"Working directory: {self._working_dir}") print(f"Output path: {self._config.output_path}") - print(f"Output directory: {self._output_dir}") + print(f"Output directory: {self._output_dir_abs}") print(f"Output basename: {self._output_basename}") print( f"Sandboxing: {'Enabled' if self._config.enable_sandboxing else 'Disabled'}" @@ -376,21 +377,21 @@ def _generate_prompt(self) -> Path: template = env.get_template('generate_pvmap_prompt.j2') # Calculate paths and prepare template variables - working_dir = str(self._working_dir) # Use defined working directory + working_dir = str(self._working_dir) # Absolute working directory # Point to tools/ directory (parent of agentic_import) - tools_dir = os.path.abspath(os.path.join(_SCRIPT_DIR, '..')) + tools_dir = os.path.abspath(os.path.join(_SCRIPT_DIR, '..')) # Absolute template_vars = { - 'working_dir': + 'working_dir_abs': working_dir, 'python_interpreter': 
sys.executable, - 'script_dir': + 'script_dir_abs': tools_dir, - 'input_data': + 'input_data_abs': str(self._config.data_config.input_data[0]) if self._config.data_config.input_data else "", - 'input_metadata': [ + 'input_metadata_abs': [ str(path) for path in self._config.data_config.input_metadata ] if self._config.data_config.input_metadata else [], 'dataset_type': @@ -400,10 +401,10 @@ def _generate_prompt(self) -> Path: 'gemini_run_id': self. _gemini_run_id, # Pass the gemini run ID for backup tracking - 'output_path': - self._config.output_path, # Full path for statvar processor - 'output_dir': - str(self._output_dir), # Directory for pvmap/metadata files + 'output_path_abs': + str(self._output_path_abs), # Absolute path prefix for outputs + 'output_dir_abs': + str(self._output_dir_abs), # Directory for pvmap/metadata files 'output_basename': self._output_basename # Base name for pvmap/metadata files } diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index cad6f360ab..5323a684f8 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -253,7 +253,8 @@ def test_relative_paths_resolved_against_working_dir(self): str(data_file.resolve())) # Verify output directory is also under working_dir self.assertTrue( - str(generator._output_dir).startswith(str(work_path.resolve()))) + str(generator._output_dir_abs).startswith( + str(work_path.resolve()))) if __name__ == '__main__': diff --git a/tools/agentic_import/templates/generate_pvmap_prompt.j2 b/tools/agentic_import/templates/generate_pvmap_prompt.j2 index 99c6445d13..8fafc6ab47 100644 --- a/tools/agentic_import/templates/generate_pvmap_prompt.j2 +++ b/tools/agentic_import/templates/generate_pvmap_prompt.j2 @@ -1242,7 +1242,7 @@ When working with SDMX datasets, follow these additional guidelines: # CORE TASK Your primary goal is to analyze the provided CSV data and generate a complete -and valid `{{output_dir}}/{{output_basename}}_pvmap.csv` and `{{output_dir}}/{{output_basename}}_metadata.csv` files which can be used with Statvar +and valid `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv` files which can be used with Statvar processor tool to produce the final DataCommons artifacts. ## 📌 IMPORTANT: FILE NAMING CONVENTION @@ -1262,18 +1262,18 @@ This naming convention allows multiple datasets to be processed in the same work - ✅ No file conflicts or overwrites - ✅ Easy to organize outputs by topic or date -**Current Task**: For this specific run, your output path is `{{output_path}}`. +**Current Task**: For this specific run, your output path is `{{output_path_abs}}`. Throughout this documentation, you will see references to generic file names. 
**You MUST use the following specific file names for this task:** | Documentation Reference | Actual File You Must Create | |------------------------|----------------------------| -| `pvmap.csv` | `{{output_dir}}/{{output_basename}}_pvmap.csv` | -| `metadata.csv` | `{{output_dir}}/{{output_basename}}_metadata.csv` | -| `output.csv` | `{{output_path}}.csv` | +| `pvmap.csv` | `{{output_path_abs}}_pvmap.csv` | +| `metadata.csv` | `{{output_path_abs}}_metadata.csv` | +| `output.csv` | `{{output_path_abs}}.csv` | -**Example**: When the documentation says "create pvmap.csv", you must actually create `{{output_dir}}/{{output_basename}}_pvmap.csv` +**Example**: When the documentation says "create pvmap.csv", you must actually create `{{output_path_abs}}_pvmap.csv` **REMEMBER**: Whenever you see generic file names in the instructions, always use the specific names with the output path prefix. @@ -1343,11 +1343,11 @@ Follow these steps sequentially. {%- endif %} -**2. Generate `{{output_dir}}/{{output_basename}}_pvmap.csv` and `{{output_dir}}/{{output_basename}}_metadata.csv`** +**2. Generate `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv`** -- Create the `{{output_dir}}/{{output_basename}}_pvmap.csv` file, mapping the source data columns to DataCommons properties based on your findings. -- Create the `{{output_dir}}/{{output_basename}}_metadata.csv` file and define the necessary `statvar_processor` configuration parameters within it. -- Configuration rule: All processor flags/settings must live in `{{output_dir}}/{{output_basename}}_metadata.csv`. Do not embed configuration in `{{output_dir}}/{{output_basename}}_pvmap.csv` and do not rely on extra CLI flags. +- Create the `{{output_path_abs}}_pvmap.csv` file, mapping the source data columns to DataCommons properties based on your findings. +- Create the `{{output_path_abs}}_metadata.csv` file and define the necessary `statvar_processor` configuration parameters within it. +- Configuration rule: All processor flags/settings must live in `{{output_path_abs}}_metadata.csv`. Do not embed configuration in `{{output_path_abs}}_pvmap.csv` and do not rely on extra CLI flags. ### Validation Checklist While generating the files, ensure: @@ -1365,9 +1365,9 @@ While generating the files, ensure: - [ ] **Special/missing values mapped appropriately** - Use `#ignore` ONLY to drop entire rows. 
For skipping individual cell values, use empty mapping: `column:value,IntermediateProperty,''` (preserves row, skips cell) #### Metadata CSV Validation: -- [ ] **{{output_dir}}/{{output_basename}}_metadata.csv covers processor flags** - Includes required parameters (e.g., `header_rows`) -- [ ] **No config in {{output_dir}}/{{output_basename}}_pvmap.csv** - `{{output_dir}}/{{output_basename}}_pvmap.csv` contains only PV mappings, not processor settings -- [ ] **No extra CLI flags** - Configuration is exclusively in `{{output_dir}}/{{output_basename}}_metadata.csv`; wrapper provides input paths +- [ ] **{{output_path_abs}}_metadata.csv covers processor flags** - Includes required parameters (e.g., `header_rows`) +- [ ] **No config in {{output_path_abs}}_pvmap.csv** - `{{output_path_abs}}_pvmap.csv` contains only PV mappings, not processor settings +- [ ] **No extra CLI flags** - Configuration is exclusively in `{{output_path_abs}}_metadata.csv`; wrapper provides input paths - [ ] **Parameter names match documentation** - Not CLI flag names - [ ] **Quote values containing commas** - `key,"value1,value2,value3"` @@ -1394,13 +1394,13 @@ For SDMX datasets, also ensure: ```bash # Run statvar processor using dedicated script -{{script_dir}}/agentic_import/run_statvar_processor.sh \ +{{script_dir_abs}}/agentic_import/run_statvar_processor.sh \ --python "{{python_interpreter}}" \ - --script-dir "{{script_dir}}" \ - --working-dir "{{working_dir}}" \ - --input-data "{{input_data}}" \ + --script-dir "{{script_dir_abs}}" \ + --working-dir "{{working_dir_abs}}" \ + --input-data "{{input_data_abs}}" \ --gemini-run-id "{{gemini_run_id}}" \ - --output-path "{{output_path}}" + --output-path "{{output_path_abs}}" ``` The wrapper reads `metadata.csv` for all processor configuration. Do not add extra flags to this command. @@ -1409,11 +1409,11 @@ The wrapper reads `metadata.csv` for all processor configuration. Do not add ext **📊 VALIDATION CHECKLIST**: - Check the command exit code (0 = success, non-zero = failure) -- Verify that `{{working_dir}}/{{output_path}}.csv` exists and is not empty +- Verify that `{{output_path_abs}}.csv` exists and is not empty - Confirm no duplicate entries for same place, date, and variable - **Verify output.csv contains all required columns**: Must include at minimum `observationAbout`, `observationDate`, `variableMeasured`, `value` -- **Verify complete column mapping**: Any observation properties mapped in {{output_dir}}/{{output_basename}}_pvmap.csv (like `unit`, `scalingFactor`, `measurementMethod`, `observationPeriod`) must be present as columns in `{{working_dir}}/{{output_path}}.csv` -- **Verify `{{output_dir}}/{{output_basename}}_metadata.csv` completeness**: Confirm `header_rows` parameter is present and correctly specified +- **Verify complete column mapping**: Any observation properties mapped in {{output_path_abs}}_pvmap.csv (like `unit`, `scalingFactor`, `measurementMethod`, `observationPeriod`) must be present as columns in `{{output_path_abs}}.csv` +- **Verify `{{output_path_abs}}_metadata.csv` completeness**: Confirm `header_rows` parameter is present and correctly specified **🎯 DECISION LOGIC - APPLY THIS EXACTLY**: @@ -1428,22 +1428,22 @@ IF all items in the VALIDATION CHECKLIST above pass: ELIF CURRENT_ATTEMPT < {{max_iterations}}: → OUTPUT: "❌ ATTEMPT CURRENT_ATTEMPT FAILED - Error details: [describe specific error]" → OUTPUT: "🔄 Starting attempt [CURRENT_ATTEMPT + 1] of {{max_iterations}}..." - → Analyze the error from logs. 
In case statvar processor failed, read log file at: {{working_dir}}/.datacommons/processor.log + → Analyze the error from logs. In case statvar processor failed, read log file at: {{working_dir_abs}}/.datacommons/processor.log {# TODO: move debugging instructions to separate section #} - → **Common {{output_dir}}/{{output_basename}}_metadata.csv issues to check:** + → **Common {{output_path_abs}}_metadata.csv issues to check:** • Missing or wrong `header_rows` (should be 1 for standard CSV with headers) • Wrong `skip_rows` value skipping too much data • Debugging parameters left in production (`process_rows`, `input_rows`, `input_columns`) • Place resolution issues: missing `places_within` or wrong `place_type` - → Modify {{output_dir}}/{{output_basename}}_pvmap.csv and/or {{output_dir}}/{{output_basename}}_metadata.csv to fix identified issues + → Modify {{output_path_abs}}_pvmap.csv and/or {{output_path_abs}}_metadata.csv to fix identified issues → INCREMENT ATTEMPT COUNTER → Return to Step 5 (Run the Processor) ELSE (CURRENT_ATTEMPT >= {{max_iterations}}): → OUTPUT: "⛔ ITERATION LIMIT REACHED: Failed after {{max_iterations}} attempts" → OUTPUT: "📋 Final Status: FAILED - Manual intervention required" - → OUTPUT: "📁 Check logs at: {{working_dir}}/.datacommons/ for debugging" - → OUTPUT: "📁 Check backup at: {{working_dir}}/runs/{{gemini_run_id}}/ for debugging" + → OUTPUT: "📁 Check logs at: {{working_dir_abs}}/.datacommons/ for debugging" + → OUTPUT: "📁 Check backup at: {{working_dir_abs}}/runs/{{gemini_run_id}}/ for debugging" → STOP EXECUTION IMMEDIATELY → DO NOT MAKE ANY MORE ATTEMPTS ``` @@ -1462,21 +1462,21 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp ```json { - "input_data": ["{{input_data}}"], - "input_metadata": {{input_metadata | tojson}}, - "working_dir": "{{working_dir}}", - "output_dir": "{{working_dir}}/{{output_dir}}", + "input_data": ["{{input_data_abs}}"], + "input_metadata": {{input_metadata_abs | tojson}}, + "working_dir": "{{working_dir_abs}}", + "output_dir": "{{output_dir_abs}}", "dataset_type": "{{dataset_type}}" } ``` # OUTPUT REQUIREMENTS & FINAL INSTRUCTION -- Generate `{{output_dir}}/{{output_basename}}_pvmap.csv` and `{{output_dir}}/{{output_basename}}_metadata.csv` +- Generate `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv` - **Adhere to Rules:** Strictly follow all schema rules, property requirements, and formatting guidelines from the knowledge base. - DO NOT deviate from the documented standards. -- Configuration location: Place all processor flags/settings in `{{output_dir}}/{{output_basename}}_metadata.csv` only. Do not embed settings in `{{output_dir}}/{{output_basename}}_pvmap.csv` and do not propose additional CLI flags. +- Configuration location: Place all processor flags/settings in `{{output_path_abs}}_metadata.csv` only. Do not embed settings in `{{output_path_abs}}_pvmap.csv` and do not propose additional CLI flags. # 🛑 FINAL EXECUTION REMINDERS @@ -1491,7 +1491,7 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp # ACTION REQUIRED NOW -**Execute** the data analysis and generate the `{{output_dir}}/{{output_basename}}_pvmap.csv` and `{{output_dir}}/{{output_basename}}_metadata.csv` +**Execute** the data analysis and generate the `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv` files now. Follow the primary workflow **WITHOUT** deviation. **REMEMBER**: You have {{max_iterations}} attempts maximum. 
Track each attempt and stop when you succeed or reach the limit. From a97e212f058dedab7d5ae69a49f00201e4c52ae5 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:06:12 +0000 Subject: [PATCH 41/54] Refactor run/dry-run test helpers --- .../sdmx_import_pipeline_test.py | 206 +++++++----------- 1 file changed, 78 insertions(+), 128 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 73bad6e7ba..8501db8000 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -653,6 +653,34 @@ def test_hash_unchanged_skips_rerun(self) -> None: class SdmxStepTest(SdmxTestBase): + def _assert_run_and_dry_run_use_same_plan(self, + step, + *, + log_contains: str, + cmd_contains: str, + extra_cmd_checks=None, + expect_verbose: bool = True + ) -> None: + extra_cmd_checks = extra_cmd_checks or [] + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue(any(log_contains in entry for entry in logs.output)) + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + command = args[0] + self.assertTrue(any(cmd_contains in arg for arg in command)) + self.assertEqual(kwargs["verbose"], expect_verbose) + for check in extra_cmd_checks: + check(command) + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -705,26 +733,11 @@ def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: ), ) step = DownloadMetadataStep(name="test-step", config=config) - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("download-metadata" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertIn("download-metadata", args[0]) - self.assertTrue(kwargs["verbose"]) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="download-metadata", + cmd_contains="download-metadata", + ) def test_download_data_step_caches_plan(self) -> None: config = PipelineConfig( @@ -773,26 +786,11 @@ def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: ), ) step = DownloadDataStep(name="test-step", config=config) - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("download-data" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertIn("download-data", args[0]) - self.assertTrue(kwargs["verbose"]) + 
self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="download-data", + cmd_contains="download-data", + ) def test_create_sample_step_caches_plan(self) -> None: config = PipelineConfig( @@ -832,28 +830,11 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: # Create test input file for run() input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("data_sampler.py" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertTrue(any("data_sampler.py" in arg for arg in args[0])) - self.assertTrue(kwargs["verbose"]) - - self.assertTrue(kwargs["verbose"]) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="data_sampler.py", + cmd_contains="data_sampler.py", + ) def test_create_sample_step_dry_run_succeeds_if_input_missing(self) -> None: config = PipelineConfig( @@ -924,28 +905,11 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: # Create test input files for run() (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") (Path(self._tmpdir) / "demo_metadata.xml").write_text("") - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("pvmap_generator.py" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertTrue(any("pvmap_generator.py" in arg for arg in args[0])) - self.assertTrue(kwargs["verbose"]) - - self.assertTrue(kwargs["verbose"]) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="pvmap_generator.py", + cmd_contains="pvmap_generator.py", + ) def test_create_schema_map_step_dry_run_succeeds_if_input_missing( self) -> None: @@ -1001,29 +965,15 @@ def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: # Create test files self._create_test_input_files("demo") - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("stat_var_processor.py" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertTrue( - any("stat_var_processor.py" in arg for arg in args[0])) - self.assertTrue( - any(arg.startswith("--input_data=") for arg in args[0])) - self.assertTrue(kwargs["verbose"]) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="stat_var_processor.py", + cmd_contains="stat_var_processor.py", + 
extra_cmd_checks=[ + lambda command: self.assertTrue( + any(arg.startswith("--input_data=") for arg in command)), + ], + ) def test_process_full_data_step_dry_run_succeeds_if_input_missing( self) -> None: @@ -1074,27 +1024,27 @@ def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: final_output_dir = Path(self._tmpdir) / "output" final_output_dir.mkdir(parents=True, exist_ok=True) (final_output_dir / "demo.csv").write_text("data") - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - step.run() - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - command = args[0] - self.assertTrue( - any("generate_custom_dc_config.py" in arg for arg in command)) - self.assertIn(f"--input_csv={final_output_dir}/demo.csv", command) - self.assertIn( - f"--output_config={final_output_dir}/demo_config.json", command) - self.assertIn("--provenance_name=FLOW", command) - self.assertIn("--source_name=AGENCY", command) - self.assertIn("--data_source_url=https://example.com", command) - self.assertIn("--dataset_url=https://example.com/data/AGENCY,FLOW,", - command) - - with self.assertLogs(logging.get_absl_logger(), level="INFO") as cm: - step.dry_run() - self.assertTrue(any("would run" in msg for msg in cm.output)) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="generate_custom_dc_config.py", + cmd_contains="generate_custom_dc_config.py", + extra_cmd_checks=[ + lambda command: self.assertIn( + f"--input_csv={final_output_dir}/demo.csv", command), + lambda command: self.assertIn( + f"--output_config={final_output_dir}/demo_config.json", + command), + lambda command: self.assertIn("--provenance_name=FLOW", + command), + lambda command: self.assertIn("--source_name=AGENCY", command), + lambda command: self.assertIn( + "--data_source_url=https://example.com", command), + lambda command: self.assertIn( + "--dataset_url=https://example.com/data/AGENCY,FLOW,", + command), + ], + expect_verbose=False, + ) if __name__ == "__main__": From b98bbac8c60ebda99c124407160d1123ffc34350 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:09:48 +0000 Subject: [PATCH 42/54] Refactor cache plan tests --- .../sdmx_import_pipeline_test.py | 126 +++++++++--------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 8501db8000..5b070886b2 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -681,6 +681,26 @@ def _assert_run_and_dry_run_use_same_plan(self, for check in extra_cmd_checks: check(command) + def _assert_step_caches_plan(self, + step, + *, + command_contains=None, + path_attrs=None) -> None: + command_contains = command_contains or [] + path_attrs = path_attrs or [] + + context1 = step._prepare_command() + context2 = step._prepare_command() + self.assertIs(context1, context2) + + for attr in path_attrs: + self.assertTrue(getattr(context1, attr).is_absolute()) + + if command_contains: + for expected in command_contains: + self.assertTrue( + any(expected in arg for arg in context1.full_command)) + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -707,16 +727,11 @@ def test_download_metadata_step_caches_plan(self) -> None: ), ) step = DownloadMetadataStep(name="test-step", config=config) - - # First call creates 
context - context1 = step._prepare_command() - self.assertIn("download-metadata", context1.full_command) - self.assertIn("--endpoint=https://example.com", context1.full_command) - - # Second call returns same object - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.output_path.is_absolute()) + self._assert_step_caches_plan( + step, + command_contains=["download-metadata", "--endpoint=https://example.com"], + path_attrs=["output_path"], + ) def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -758,18 +773,16 @@ def test_download_data_step_caches_plan(self) -> None: ), ) step = DownloadDataStep(name="test-step", config=config) - - # First call creates context - context1 = step._prepare_command() - self.assertIn("download-data", context1.full_command) - self.assertIn("--endpoint=https://example.com", context1.full_command) - self.assertIn("--key=test-key", context1.full_command) - self.assertIn("--param=area=US", context1.full_command) - - # Second call returns same object - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.output_path.is_absolute()) + self._assert_step_caches_plan( + step, + command_contains=[ + "download-data", + "--endpoint=https://example.com", + "--key=test-key", + "--param=area=US", + ], + path_attrs=["output_path"], + ) def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -803,17 +816,11 @@ def test_create_sample_step_caches_plan(self) -> None: sample=SampleConfig(rows=500), ) step = CreateSampleStep(name="test-step", config=config) - - # No input file created, dry run should still succeed - context1 = step._prepare_command() - self.assertTrue( - any("data_sampler.py" in arg for arg in context1.full_command)) - self.assertIn("--sampler_output_rows=500", context1.full_command) - - # Second call returns same object - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.output_path.is_absolute()) + self._assert_step_caches_plan( + step, + command_contains=["data_sampler.py", "--sampler_output_rows=500"], + path_attrs=["output_path"], + ) def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -876,22 +883,15 @@ def test_create_schema_map_step_caches_plan(self) -> None: skip_confirmation=True, ),) step = CreateSchemaMapStep(name="test-step", config=config) - - # No input files created, dry run should still succeed - - # First call creates context - context1 = step._prepare_command() - self.assertTrue( - any("pvmap_generator.py" in arg for arg in context1.full_command)) - self.assertIn("--gemini_cli=custom-gemini", context1.full_command) - self.assertIn("--skip_confirmation", context1.full_command) - - # Second call returns same object - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.sample_path.is_absolute()) - self.assertTrue(context1.metadata_path.is_absolute()) - self.assertTrue(context1.output_prefix.is_absolute()) + self._assert_step_caches_plan( + step, + command_contains=[ + "pvmap_generator.py", + "--gemini_cli=custom-gemini", + "--skip_confirmation", + ], + path_attrs=["sample_path", "metadata_path", "output_prefix"], + ) def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -943,16 +943,15 @@ def test_process_full_data_step_caches_plan(self) -> None: verbose=True, 
),) step = ProcessFullDataStep(name="test-step", config=config) - - # No input files created, dry run should still succeed - - context1 = step._prepare_command() - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.input_data_path.is_absolute()) - self.assertTrue(context1.pv_map_path.is_absolute()) - self.assertTrue(context1.metadata_path.is_absolute()) - self.assertTrue(context1.output_prefix.is_absolute()) + self._assert_step_caches_plan( + step, + path_attrs=[ + "input_data_path", + "pv_map_path", + "metadata_path", + "output_prefix", + ], + ) def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -1005,11 +1004,10 @@ def test_create_dc_config_step_caches_plan(self) -> None: agency="AGENCY", dataflow="FLOW") step = CreateDcConfigStep(name="test-step", config=config) - context1 = step._prepare_command() - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.input_csv.is_absolute()) - self.assertTrue(context1.output_config.is_absolute()) + self._assert_step_caches_plan( + step, + path_attrs=["input_csv", "output_config"], + ) def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: config = self._build_config(dataset_prefix="demo", From c178bfa077688cd885e40cafb765b255490838bf Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:12:09 +0000 Subject: [PATCH 43/54] Refactor missing-input test helpers --- .../sdmx_import_pipeline_test.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 5b070886b2..4a20fbb37b 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -701,6 +701,13 @@ def _assert_step_caches_plan(self, self.assertTrue( any(expected in arg for arg in context1.full_command)) + def _assert_dry_run_succeeds_without_input(self, step) -> None: + step.dry_run() + + def _assert_run_fails_without_input(self, step, error_pattern: str) -> None: + with self.assertRaisesRegex(RuntimeError, error_pattern): + step.run() + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -855,7 +862,7 @@ def test_create_sample_step_dry_run_succeeds_if_input_missing(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) # No input file created, dry run should still succeed - step.dry_run() + self._assert_dry_run_succeeds_without_input(step) def test_create_sample_step_run_fails_if_input_missing(self) -> None: config = PipelineConfig( @@ -869,9 +876,8 @@ def test_create_sample_step_run_fails_if_input_missing(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) # No input file created, run should fail - with self.assertRaisesRegex(RuntimeError, - "Input file missing for sampling"): - step.run() + self._assert_run_fails_without_input( + step, "Input file missing for sampling") def test_create_schema_map_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -921,7 +927,7 @@ def test_create_schema_map_step_dry_run_succeeds_if_input_missing( ),) step = CreateSchemaMapStep(name="test-step", config=config) # No input files created, dry run should still succeed - step.dry_run() + self._assert_dry_run_succeeds_without_input(step) def 
test_create_schema_map_step_run_fails_if_input_missing(self) -> None: config = PipelineConfig(run=RunConfig( @@ -932,8 +938,7 @@ def test_create_schema_map_step_run_fails_if_input_missing(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) # No input files created, run should fail - with self.assertRaises(RuntimeError): - step.run() + self._assert_run_fails_without_input(step, ".*") def test_process_full_data_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -984,7 +989,7 @@ def test_process_full_data_step_dry_run_succeeds_if_input_missing( ),) step = ProcessFullDataStep(name="test-step", config=config) # Missing input files, dry run should still succeed - step.dry_run() + self._assert_dry_run_succeeds_without_input(step) def test_process_full_data_step_run_fails_if_input_missing(self) -> None: config = PipelineConfig(run=RunConfig( @@ -995,8 +1000,7 @@ def test_process_full_data_step_run_fails_if_input_missing(self) -> None: ),) step = ProcessFullDataStep(name="test-step", config=config) # Missing input files, run should fail - with self.assertRaises(RuntimeError): - step.run() + self._assert_run_fails_without_input(step, ".*") def test_create_dc_config_step_caches_plan(self) -> None: config = self._build_config(dataset_prefix="demo", From ef361a9f36efc86c7b436408e51a1512b5374842 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:14:10 +0000 Subject: [PATCH 44/54] lint fix --- .../sdmx_import_pipeline_test.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 4a20fbb37b..0340b8b227 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -653,14 +653,14 @@ def test_hash_unchanged_skips_rerun(self) -> None: class SdmxStepTest(SdmxTestBase): - def _assert_run_and_dry_run_use_same_plan(self, - step, - *, - log_contains: str, - cmd_contains: str, - extra_cmd_checks=None, - expect_verbose: bool = True - ) -> None: + def _assert_run_and_dry_run_use_same_plan( + self, + step, + *, + log_contains: str, + cmd_contains: str, + extra_cmd_checks=None, + expect_verbose: bool = True) -> None: extra_cmd_checks = extra_cmd_checks or [] with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" ) as mock_run_cmd: @@ -736,7 +736,9 @@ def test_download_metadata_step_caches_plan(self) -> None: step = DownloadMetadataStep(name="test-step", config=config) self._assert_step_caches_plan( step, - command_contains=["download-metadata", "--endpoint=https://example.com"], + command_contains=[ + "download-metadata", "--endpoint=https://example.com" + ], path_attrs=["output_path"], ) @@ -876,8 +878,8 @@ def test_create_sample_step_run_fails_if_input_missing(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) # No input file created, run should fail - self._assert_run_fails_without_input( - step, "Input file missing for sampling") + self._assert_run_fails_without_input(step, + "Input file missing for sampling") def test_create_schema_map_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -1036,8 +1038,8 @@ def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: lambda command: self.assertIn( f"--output_config={final_output_dir}/demo_config.json", command), - lambda command: self.assertIn("--provenance_name=FLOW", - command), + lambda command: 
self.assertIn("--provenance_name=FLOW", command + ), lambda command: self.assertIn("--source_name=AGENCY", command), lambda command: self.assertIn( "--data_source_url=https://example.com", command), From 6a93ffcd16c111f3e9795f14eaae8b95dd741641 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:17:31 +0000 Subject: [PATCH 45/54] Fix absolute output path handling --- tools/agentic_import/run_statvar_processor.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/run_statvar_processor.sh b/tools/agentic_import/run_statvar_processor.sh index d907e4f724..2c2cd83fca 100755 --- a/tools/agentic_import/run_statvar_processor.sh +++ b/tools/agentic_import/run_statvar_processor.sh @@ -67,6 +67,13 @@ if [[ -z "$PYTHON_INTERPRETER" || -z "$SCRIPT_DIR" || -z "$WORKING_DIR" || -z "$ exit 1 fi +# Normalize output prefix: respect absolute paths, otherwise anchor under working dir. +if [[ "${OUTPUT_PATH}" = /* ]]; then + OUTPUT_PREFIX="${OUTPUT_PATH}" +else + OUTPUT_PREFIX="${WORKING_DIR}/${OUTPUT_PATH}" +fi + # Create .datacommons directory if it doesn't exist mkdir -p "${WORKING_DIR}/.datacommons" @@ -81,20 +88,19 @@ OUTPUT_COLUMNS="observationDate,observationAbout,variableMeasured,value,observat echo "Running statvar processor..." "${PYTHON_INTERPRETER}" "${SCRIPT_DIR}/statvar_importer/stat_var_processor.py" \ --input_data="${INPUT_DATA}" \ - --pv_map="${WORKING_DIR}/${OUTPUT_PATH}_pvmap.csv" \ - --config_file="${WORKING_DIR}/${OUTPUT_PATH}_metadata.csv" \ + --pv_map="${OUTPUT_PREFIX}_pvmap.csv" \ + --config_file="${OUTPUT_PREFIX}_metadata.csv" \ --generate_statvar_name=True \ --skip_constant_csv_columns=False \ --output_columns="${OUTPUT_COLUMNS}" \ --output_counters="${WORKING_DIR}/.datacommons/output_counters.csv" \ - --output_path="${WORKING_DIR}/${OUTPUT_PATH}" > "${PROCESSOR_LOG}" 2>&1 + --output_path="${OUTPUT_PREFIX}" > "${PROCESSOR_LOG}" 2>&1 # Capture the processor exit code PROCESSOR_EXIT_CODE=${PIPESTATUS[0]} # Run backup script silently (redirect output to backup log) echo "Backing up run data..." 
-OUTPUT_PREFIX="${WORKING_DIR}/${OUTPUT_PATH}" declare -a BACKUP_ARGS=( "--working_dir=${WORKING_DIR}" "--gemini_run_id=${GEMINI_RUN_ID}" From fc8d555af1b906505cc8c116399d3bc6e3840510 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:21:58 +0000 Subject: [PATCH 46/54] Add abs path backup test --- tools/agentic_import/backup_processor_run_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/agentic_import/backup_processor_run_test.py b/tools/agentic_import/backup_processor_run_test.py index 3681b3e7ae..027f1f81f3 100644 --- a/tools/agentic_import/backup_processor_run_test.py +++ b/tools/agentic_import/backup_processor_run_test.py @@ -45,6 +45,17 @@ def _read_manifest(self, backup_dir: Path) -> str: with open(manifest_path, 'r') as manifest_file: return manifest_file.read() + def test_absolute_path_copied(self): + absolute_file = self.working_dir / 'abs.txt' + absolute_file.write_text('absolute') + + backup_dir = self._run_backup([str(absolute_file)]) + + self.assertTrue((backup_dir / 'abs.txt').exists()) + manifest = self._read_manifest(backup_dir) + self.assertIn(str(absolute_file), manifest) + self.assertNotIn('Skipped (missing or blocked):', manifest) + def test_copies_requested_files(self): first = self.working_dir / 'a.txt' second = self.working_dir / 'b.txt' From 7b33667c82fb20b7d16c29117cabd28613ac1d2c Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 08:27:12 +0000 Subject: [PATCH 47/54] minor fix in tests --- tools/agentic_import/state_handler_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/agentic_import/state_handler_test.py b/tools/agentic_import/state_handler_test.py index c000260f6d..dfcdd416dc 100644 --- a/tools/agentic_import/state_handler_test.py +++ b/tools/agentic_import/state_handler_test.py @@ -47,7 +47,7 @@ def test_missing_file_creates_empty_state(self) -> None: data = json.load(fp) self.assertEqual(data["dataset_prefix"], "demo") self.assertEqual(data["steps"], {}) - self.assertIsNone(data["updated_at_ts"]) + self.assertIsNone(data.get("updated_at_ts")) def test_corrupt_file_creates_backup_and_resets_state(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: From cbec5c1e1c9cb50ad55a444324cb9a6a8fb49de8 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 08:33:34 +0000 Subject: [PATCH 48/54] refactor --- tools/agentic_import/sdmx_import_pipeline.py | 86 ++++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 3c933f5d5a..99ff2cf000 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -64,6 +64,49 @@ FLAGS = flags.FLAGS +def _define_flags() -> None: + flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") + flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) + + flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, + "Owning SDMX agency identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) + + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, + "Target SDMX dataflow identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) + + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, + "Optional SDMX key or filter.") + + flags.DEFINE_string( + _FLAG_SDMX_DATAFLOW_PARAM, None, + "Optional SDMX parameter appended to the dataflow query.") + + flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, + "Number of rows to sample from downloaded data.") + + 
flags.DEFINE_string( + "dataset_prefix", None, + "Optional dataset prefix to override auto-derived values.") + + flags.DEFINE_string("run_only", None, + "Execute only a specific pipeline step by name.") + + flags.DEFINE_boolean("force", False, "Force all steps to run.") + + flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") + + flags.DEFINE_boolean("skip_confirmation", False, + "Skip interactive confirmation prompts.") + + flags.DEFINE_string("gemini_cli", "gemini", + "Path to Gemini CLI executable.") + + flags.DEFINE_string("working_dir", None, + "Working directory for the pipeline.") + + @dataclass(frozen=True) class SdmxDataflowConfig: """Configuration for SDMX dataflow.""" @@ -949,49 +992,6 @@ def prepare_config() -> PipelineConfig: ) -def _define_flags() -> None: - flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") - flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) - - flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, - "Owning SDMX agency identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) - - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, - "Target SDMX dataflow identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) - - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, - "Optional SDMX key or filter.") - - flags.DEFINE_string( - _FLAG_SDMX_DATAFLOW_PARAM, None, - "Optional SDMX parameter appended to the dataflow query.") - - flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, - "Number of rows to sample from downloaded data.") - - flags.DEFINE_string( - "dataset_prefix", None, - "Optional dataset prefix to override auto-derived values.") - - flags.DEFINE_string("run_only", None, - "Execute only a specific pipeline step by name.") - - flags.DEFINE_boolean("force", False, "Force all steps to run.") - - flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") - - flags.DEFINE_boolean("skip_confirmation", False, - "Skip interactive confirmation prompts.") - - flags.DEFINE_string("gemini_cli", "gemini", - "Path to Gemini CLI executable.") - - flags.DEFINE_string("working_dir", None, - "Working directory for the pipeline.") - - def main(_: list[str]) -> int: config = prepare_config() logging.info(f"SDMX pipeline configuration: {config}") From 1f74094babce263c52085a1aaaa0c2deda10da61 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 08:48:01 +0000 Subject: [PATCH 49/54] refactor: Remove `get_steps` method and directly access `pipeline.steps` attribute. 
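
Call sites switch from the removed accessor to reading the frozen dataclass
field directly. A minimal before/after sketch (it mirrors the test updates in
this patch); note that `pipeline.steps` now exposes the stored Sequence rather
than the fresh list that `get_steps()` used to return:

    # Before: accessor returned a new list copy of the steps.
    names = [step.name for step in pipeline.get_steps()]

    # After: read the Sequence stored on the frozen Pipeline dataclass.
    names = [step.name for step in pipeline.steps]
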
--- tools/agentic_import/pipeline.py | 5 +---- tools/agentic_import/sdmx_import_pipeline_test.py | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index 09b22a94dd..43d009c3b6 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -67,9 +67,6 @@ def version(self) -> int: class Pipeline: steps: Sequence[Step] - def get_steps(self) -> list[Step]: - return list(self.steps) - class PipelineCallback: """Lifecycle hooks consumed by the runner; defaults are no-ops.""" @@ -112,7 +109,7 @@ def run(self, pipeline: Pipeline, callback: PipelineCallback | None = None) -> None: current_step: Step | None = None - steps = pipeline.get_steps() + steps = pipeline.steps logging.info(f"Starting pipeline with {len(steps)} steps") try: for step in steps: diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 0340b8b227..6f971faecb 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -341,7 +341,7 @@ def _names_from_builder(self, steps=builder_steps) result = builder.build() pipeline = result.pipeline - return [step.name for step in pipeline.get_steps()] + return [step.name for step in pipeline.steps] def test_run_only_step(self) -> None: cfg_step = PipelineConfig( @@ -432,7 +432,7 @@ def test_version_bump_schedules_downstream(self) -> None: self.assertEqual(names, ["process-full-data", "create-dc-config"]) pipeline = build_sdmx_pipeline(config=cfg, state=state, steps=steps) - self.assertEqual([s.name for s in pipeline.get_steps()], + self.assertEqual([s.name for s in pipeline.steps], ["process-full-data", "create-dc-config"]) def test_incremental_records_skip_reasons(self) -> None: @@ -448,7 +448,7 @@ def test_incremental_records_skip_reasons(self) -> None: steps = build_steps(cfg) builder = PipelineBuilder(config=cfg, state=state, steps=steps) result = builder.build() - self.assertFalse(result.pipeline.get_steps()) + self.assertFalse(result.pipeline.steps) self.assertEqual(len(result.decisions), len(steps)) for decision in result.decisions: self.assertEqual(decision.decision, StepDecision.SKIP) From 31c00f86a9f223f2b9a38aeecb4757e587c43bb3 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 09:00:07 +0000 Subject: [PATCH 50/54] feat: Introduce structured SDMX agentic import pipeline with dedicated configuration and step implementations. 
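
The former monolith is split into sdmx_pipeline_config.py (configuration
dataclasses), sdmx_pipeline_steps.py (step implementations) and
sdmx_pipeline_builder.py (planning), while sdmx_import_pipeline.py keeps the
CLI flags and state wiring. A rough sketch of how the pieces compose; the
literal endpoint/agency/dataflow/prefix values and the working directory are
placeholders, and the PipelineState/StateHandler wiring is omitted:

    from tools.agentic_import.sdmx_pipeline_builder import build_steps
    from tools.agentic_import.sdmx_pipeline_config import (PipelineConfig,
                                                           RunConfig, SdmxConfig,
                                                           SdmxDataflowConfig)

    config = PipelineConfig(
        sdmx=SdmxConfig(endpoint="https://example.com",
                        agency="AGENCY",
                        dataflow=SdmxDataflowConfig(id="FLOW")),
        run=RunConfig(command="python",
                      dataset_prefix="demo",
                      working_dir="/tmp/demo"))

    # build_steps returns the canonical six-step list; dry_run only logs the
    # command each step would execute and touches no files.
    for step in build_steps(config):
        step.dry_run()

build_sdmx_pipeline(config=..., state=..., steps=...) then applies the usual
planning (run_only, force, critical-input hash and incremental checks) on top
of these steps before handing the resulting Pipeline to the runner.
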
--- tools/agentic_import/sdmx_import_pipeline.py | 696 +----------------- .../sdmx_import_pipeline_test.py | 18 +- tools/agentic_import/sdmx_pipeline_builder.py | 231 ++++++ tools/agentic_import/sdmx_pipeline_config.py | 61 ++ tools/agentic_import/sdmx_pipeline_steps.py | 467 ++++++++++++ 5 files changed, 781 insertions(+), 692 deletions(-) create mode 100644 tools/agentic_import/sdmx_pipeline_builder.py create mode 100644 tools/agentic_import/sdmx_pipeline_config.py create mode 100644 tools/agentic_import/sdmx_pipeline_steps.py diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 99ff2cf000..a26e6799f0 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,20 +15,17 @@ from __future__ import annotations -import abc import copy import hashlib import json import os import re import shlex -import subprocess import sys import dataclasses -from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path -from typing import Callable, ClassVar, Sequence +from typing import Callable, Sequence from absl import app, flags, logging @@ -36,22 +33,16 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, - PipelineAbort, PipelineCallback, - PipelineRunner, RunnerConfig, Step) -from tools.agentic_import.state_handler import (PipelineState, StateHandler, - StepState) - -SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" -DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" -STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / - "stat_var_processor.py") -PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" -DC_CONFIG_GENERATOR_PATH = (REPO_ROOT / "tools" / "agentic_import" / - "generate_custom_dc_config.py") - -SAMPLE_OUTPUT_DIR = Path("sample_output") -FINAL_OUTPUT_DIR = Path("output") +from tools.agentic_import.pipeline import (CompositeCallback, PipelineAbort, + PipelineCallback, PipelineRunner, + RunnerConfig, Step) +from tools.agentic_import.sdmx_pipeline_builder import build_sdmx_pipeline +from tools.agentic_import.sdmx_pipeline_config import (PipelineConfig, + RunConfig, SampleConfig, + SdmxConfig, + SdmxDataflowConfig) +from tools.agentic_import.sdmx_pipeline_steps import SdmxStep +from tools.agentic_import.state_handler import StateHandler, StepState # Flag names _FLAG_SDMX_ENDPOINT = "sdmx.endpoint" @@ -107,86 +98,6 @@ def _define_flags() -> None: "Working directory for the pipeline.") -@dataclass(frozen=True) -class SdmxDataflowConfig: - """Configuration for SDMX dataflow.""" - id: str | None = None - key: str | None = None - param: str | None = None - - -@dataclass(frozen=True) -class SdmxConfig: - """Configuration for SDMX data access.""" - endpoint: str | None = None - agency: str | None = None - dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) - - -@dataclass(frozen=True) -class SampleConfig: - """Configuration for data sampling.""" - rows: int = 1000 - - -@dataclass(frozen=True) -class RunConfig: - """Configuration for pipeline execution.""" - command: str - dataset_prefix: str | None = None - working_dir: str | None = None - run_only: str | None = None - force: bool = False - verbose: bool = False - skip_confirmation: bool = False - gemini_cli: str | None = None - - -@dataclass(frozen=True) -class PipelineConfig: - 
"""Aggregated configuration for the pipeline.""" - sdmx: SdmxConfig = field(default_factory=SdmxConfig) - sample: SampleConfig = field(default_factory=SampleConfig) - run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) - - -@dataclass(frozen=True) -class StepDecision: - """Represents whether a step will run and why.""" - - RUN: ClassVar[str] = "RUN" - SKIP: ClassVar[str] = "SKIP" - - step_name: str - decision: str - reason: str - - -@dataclass(frozen=True) -class BuildResult: - """Output of planning that includes the pipeline and per-step decisions.""" - - pipeline: Pipeline - decisions: list[StepDecision] - - -def _require_config_field(value: str | None, field: str, step_name: str) -> str: - if value: - return value - raise ValueError(f"{step_name} requires config.{field}") - - -def _run_command(command: Sequence[str], *, verbose: bool) -> None: - if verbose: - logging.debug(f"Running command: {' '.join(command)}") - subprocess.run(command, check=True) - - -def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: - command = [sys.executable, str(SDMX_CLI_PATH), *args] - _run_command(command, verbose=verbose) - - def _format_time(value: datetime) -> str: if value.tzinfo is None: value = value.replace(tzinfo=timezone.utc) @@ -343,591 +254,6 @@ def build_pipeline_callback( return CompositeCallback([interactive, json_callback]) -class SdmxStep(Step): - """Base class for SDMX steps that carries immutable config and version.""" - - def __init__(self, *, name: str, version: int, - config: PipelineConfig) -> None: - if not name: - raise ValueError("step requires a name") - self._name = name - self._version = version - self._config = config - - @property - def name(self) -> str: - return self._name - - @property - def version(self) -> int: - return self._version - - @abc.abstractmethod - def dry_run(self) -> None: - """Log a read-only preview of the work to be done.""" - - -class DownloadDataStep(SdmxStep): - """Downloads SDMX data payloads.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - full_command: list[str] - output_path: Path - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: DownloadDataStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) - agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) - dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - output_path = working_dir / f"{dataset_prefix}_data.csv" - args = [ - "download-data", - f"--endpoint={endpoint}", - f"--agency={agency}", - f"--dataflow={dataflow}", - f"--output_path={output_path}", - ] - if self._config.sdmx.dataflow.key: - args.append(f"--key={self._config.sdmx.dataflow.key}") - if self._config.sdmx.dataflow.param: - args.append(f"--param={self._config.sdmx.dataflow.param}") - if self._config.run.verbose: - args.append("--verbose") - full_command = [sys.executable, str(SDMX_CLI_PATH)] + args - self._context = DownloadDataStep._StepContext(full_command=full_command, - output_path=output_path) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if self._config.run.verbose: - 
logging.info( - f"Starting SDMX data download: {' '.join(context.full_command)} -> {context.output_path}" - ) - else: - logging.info(f"Downloading SDMX data to {context.output_path}") - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class DownloadMetadataStep(SdmxStep): - """Downloads SDMX metadata payloads.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - full_command: list[str] - output_path: Path - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: DownloadMetadataStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) - agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) - dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - output_path = working_dir / f"{dataset_prefix}_metadata.xml" - args = [ - "download-metadata", - f"--endpoint={endpoint}", - f"--agency={agency}", - f"--dataflow={dataflow}", - f"--output_path={output_path}", - ] - if self._config.run.verbose: - args.append("--verbose") - full_command = [sys.executable, str(SDMX_CLI_PATH)] + args - self._context = DownloadMetadataStep._StepContext( - full_command=full_command, output_path=output_path) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if self._config.run.verbose: - logging.info( - f"Starting SDMX metadata download: {' '.join(context.full_command)} -> {context.output_path}" - ) - else: - logging.info(f"Downloading SDMX metadata to {context.output_path}") - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class CreateSampleStep(SdmxStep): - """Creates a sample dataset from downloaded data.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - input_path: Path - full_command: list[str] - output_path: Path - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: CreateSampleStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - input_path = working_dir / f"{dataset_prefix}_data.csv" - output_path = working_dir / f"{dataset_prefix}_sample.csv" - - args = [ - f"--sampler_input={input_path}", - f"--sampler_output={output_path}", - f"--sampler_output_rows={self._config.sample.rows}", - ] - full_command = [sys.executable, str(DATA_SAMPLER_PATH)] + args - self._context = CreateSampleStep._StepContext(input_path=input_path, - full_command=full_command, - output_path=output_path) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if not context.input_path.is_file(): - raise RuntimeError( - f"Input file missing for sampling: 
{context.input_path}") - if self._config.run.verbose: - logging.info( - f"Starting data sampling: {' '.join(context.full_command)} -> {context.output_path}" - ) - else: - logging.info(f"Sampling data to {context.output_path}") - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class CreateSchemaMapStep(SdmxStep): - """Builds schema mappings for transformed data.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - sample_path: Path - metadata_path: Path - output_prefix: Path - full_command: list[str] - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: CreateSchemaMapStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - sample_path = working_dir / f"{dataset_prefix}_sample.csv" - metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" - output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix - - args = [ - f"--input_data={sample_path}", - f"--input_metadata={metadata_path}", - "--sdmx_dataset", - f"--output_path={output_prefix}", - ] - if self._config.run.skip_confirmation: - args.append("--skip_confirmation") - if self._config.run.gemini_cli: - args.append(f"--gemini_cli={self._config.run.gemini_cli}") - args.append(f"--working_dir={working_dir}") - - full_command = [sys.executable, str(PVMAP_GENERATOR_PATH)] + args - self._context = CreateSchemaMapStep._StepContext( - sample_path=sample_path, - metadata_path=metadata_path, - output_prefix=output_prefix, - full_command=full_command) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if not context.sample_path.is_file(): - raise RuntimeError(f"Sample file missing: {context.sample_path}") - if not context.metadata_path.is_file(): - raise RuntimeError( - f"Metadata file missing: {context.metadata_path}") - context.output_prefix.parent.mkdir(parents=True, exist_ok=True) - logging.info( - f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" - ) - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class ProcessFullDataStep(SdmxStep): - """Processes full SDMX data into DC artifacts.""" - - VERSION = 1 - - RUN_OUTPUT_COLUMNS: ClassVar[str] = ( - "observationDate,observationAbout,variableMeasured,value," - "observationPeriod,measurementMethod,unit,scalingFactor") - - @dataclass(frozen=True) - class _StepContext: - input_data_path: Path - pv_map_path: Path - metadata_path: Path - full_command: list[str] - output_prefix: Path - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: ProcessFullDataStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - input_data_path = working_dir / f"{dataset_prefix}_data.csv" - pv_map_path = 
(working_dir / SAMPLE_OUTPUT_DIR / - f"{dataset_prefix}_pvmap.csv") - metadata_path = (working_dir / SAMPLE_OUTPUT_DIR / - f"{dataset_prefix}_metadata.csv") - output_prefix = working_dir / FINAL_OUTPUT_DIR / dataset_prefix - - args = [ - f"--input_data={input_data_path}", - f"--pv_map={pv_map_path}", - f"--config_file={metadata_path}", - "--generate_statvar_name=True", - "--skip_constant_csv_columns=False", - f"--output_columns={self.RUN_OUTPUT_COLUMNS}", - f"--output_path={output_prefix}", - ] - full_command = [sys.executable, str(STAT_VAR_PROCESSOR_PATH)] + args - self._context = ProcessFullDataStep._StepContext( - input_data_path=input_data_path, - pv_map_path=pv_map_path, - metadata_path=metadata_path, - full_command=full_command, - output_prefix=output_prefix, - ) - return self._context - - def run(self) -> None: - context = self._prepare_command() - for required in (context.input_data_path, context.pv_map_path, - context.metadata_path): - if not required.is_file(): - raise RuntimeError( - f"{self.name} requires existing input: {required}") - # Ensure output directory exists - context.output_prefix.parent.mkdir(parents=True, exist_ok=True) - logging.info( - f"Starting stat_var_processor: input={context.input_data_path} " - f"pvmap={context.pv_map_path} metadata={context.metadata_path} -> " - f"{context.output_prefix}") - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class CreateDcConfigStep(SdmxStep): - """Generates Datacommons configuration artifacts.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - input_csv: Path - output_config: Path - full_command: list[str] - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: CreateDcConfigStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - input_csv = working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}.csv" - output_config = (working_dir / FINAL_OUTPUT_DIR / - f"{dataset_prefix}_config.json") - - endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) - agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) - dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) - - dataset_url = (f"{endpoint.rstrip('/')}/data/" - f"{agency},{dataflow},") - - args = [ - f"--input_csv={input_csv}", - f"--output_config={output_config}", - f"--provenance_name={dataflow}", - f"--source_name={agency}", - f"--data_source_url={endpoint}", - f"--dataset_url={dataset_url}", - ] - full_command = [sys.executable, str(DC_CONFIG_GENERATOR_PATH)] + args - self._context = CreateDcConfigStep._StepContext( - input_csv=input_csv, - output_config=output_config, - full_command=full_command) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if not context.input_csv.is_file(): - raise RuntimeError( - f"{self.name} requires existing input: {context.input_csv}") - - logging.info( - f"Starting custom DC config generation: input={context.input_csv} -> {context.output_config}" - ) - _run_command(context.full_command, 
verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class PipelineBuilder: - - def __init__(self, - *, - config: PipelineConfig, - state: PipelineState, - steps: Sequence[Step], - critical_input_hash: str | None = None) -> None: - self._config = config - self._state = state - self._steps = steps - self._critical_input_hash = critical_input_hash - - def build(self) -> BuildResult: - if self._config.run.run_only: - planned, decisions = self._plan_run_only(self._config.run.run_only) - elif self._config.run.force: - logging.info("Force flag set; scheduling all SDMX steps") - planned, decisions = self._plan_all_steps( - "Force flag set; scheduling this step") - elif self._hash_changed(): - logging.info("Critical inputs changed; scheduling all SDMX steps") - planned, decisions = self._plan_all_steps( - "Critical inputs changed; scheduling this step") - else: - planned, decisions = self._plan_incremental() - logging.info("Built SDMX pipeline with %d steps", len(planned)) - return BuildResult(pipeline=Pipeline(steps=planned), - decisions=decisions) - - def _plan_run_only(self, - run_only: str) -> tuple[list[Step], list[StepDecision]]: - planned: list[Step] = [] - decisions: list[StepDecision] = [] - for step in self._steps: - if step.name == run_only: - planned.append(step) - decisions.append( - StepDecision( - step_name=step.name, - decision=StepDecision.RUN, - reason=(f"run_only={run_only} requested; running only " - "this step"), - )) - else: - decisions.append( - StepDecision( - step_name=step.name, - decision=StepDecision.SKIP, - reason=(f"run_only={run_only} requested; skipping " - "this step"), - )) - if not planned: - raise ValueError(f"run_only step not found: {run_only}") - return planned, decisions - - def _plan_all_steps(self, - reason: str) -> tuple[list[Step], list[StepDecision]]: - planned: list[Step] = [] - decisions: list[StepDecision] = [] - for step in self._steps: - planned.append(step) - decisions.append( - StepDecision(step_name=step.name, - decision=StepDecision.RUN, - reason=reason)) - return planned, decisions - - def _plan_incremental(self) -> tuple[list[Step], list[StepDecision]]: - planned: list[Step] = [] - decisions: list[StepDecision] = [] - schedule_all_remaining = False - previous: Step | None = None - for step in self._steps: - if schedule_all_remaining: - planned.append(step) - decisions.append( - StepDecision( - step_name=step.name, - decision=StepDecision.RUN, - reason=("Upstream step triggered rerun for remaining " - "steps"), - )) - previous = step - continue - - prev_state = self._state.steps.get(step.name) - if prev_state is None: - needs_run = True - reason = "No previous state recorded; scheduling step" - elif prev_state.status != "succeeded": - needs_run = True - reason = (f"Previous run status was {prev_state.status}; " - "rerunning step") - elif prev_state.version < step.version: - needs_run = True - reason = ( - f"Step version increased from {prev_state.version} to " - f"{step.version}; rerunning step") - else: - needs_run = False - reason = ("Previous run succeeded with same version; step is " - "up-to-date") - - if not needs_run and previous is not None: - if self._predecessor_newer(previous, step): - needs_run = True - reason = (f"Previous step {previous.name} finished more " - "recently; rerunning downstream steps") - - if needs_run: - planned.append(step) - decisions.append( - 
StepDecision(step_name=step.name, - decision=StepDecision.RUN, - reason=reason)) - schedule_all_remaining = True - else: - decisions.append( - StepDecision(step_name=step.name, - decision=StepDecision.SKIP, - reason=reason)) - previous = step - - if not planned: - logging.info("No steps scheduled.") - return planned, decisions - - def _hash_changed(self) -> bool: - if not self._critical_input_hash: - return False - previous = self._state.critical_input_hash - if not previous: - return True - return previous != self._critical_input_hash - - def _predecessor_newer(self, prev_step: Step, step: Step) -> bool: - prev_state = self._state.steps.get(prev_step.name) - curr_state = self._state.steps.get(step.name) - if prev_state is None or prev_state.ended_at_ts is None: - return False - if curr_state is None: - return True - if curr_state.status != "succeeded": - return True - if curr_state.ended_at_ts is None: - return True - return prev_state.ended_at_ts > curr_state.ended_at_ts - - -def build_steps(config: PipelineConfig) -> list[Step]: - """Constructs the hard-coded list of canonical steps.""" - return [ - DownloadDataStep(name="download-data", config=config), - DownloadMetadataStep(name="download-metadata", config=config), - CreateSampleStep(name="create-sample", config=config), - CreateSchemaMapStep(name="create-schema-mapping", config=config), - ProcessFullDataStep(name="process-full-data", config=config), - CreateDcConfigStep(name="create-dc-config", config=config), - ] - - -def _log_step_decisions(decisions: Sequence[StepDecision]) -> None: - for decision in decisions: - logging.info("step=%s decision=%s reason=%s", decision.step_name, - decision.decision, decision.reason) - - -def build_sdmx_pipeline(*, - config: PipelineConfig, - state: PipelineState, - steps: Sequence[Step] | None = None, - critical_input_hash: str | None = None) -> Pipeline: - builder_steps = steps if steps is not None else build_steps(config) - builder = PipelineBuilder(config=config, - state=state, - steps=builder_steps, - critical_input_hash=critical_input_hash) - result = builder.build() - _log_step_decisions(result.decisions) - return result.pipeline - - def run_sdmx_pipeline( *, config: PipelineConfig, diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 6f971faecb..88b86b5721 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -44,11 +44,15 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, - run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - CreateSchemaMapStep, ProcessFullDataStep, CreateDcConfigStep, _run_command, - SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig, SdmxStep) + InteractiveCallback, JSONStateCallback, build_pipeline_callback, + run_sdmx_pipeline) +from tools.agentic_import.sdmx_pipeline_builder import ( # pylint: disable=import-error + PipelineBuilder, StepDecision, build_sdmx_pipeline, build_steps) +from tools.agentic_import.sdmx_pipeline_config import ( # pylint: disable=import-error + PipelineConfig, RunConfig, SampleConfig, SdmxConfig, SdmxDataflowConfig) +from tools.agentic_import.sdmx_pipeline_steps import ( # pylint: disable=import-error + CreateDcConfigStep, CreateSampleStep, CreateSchemaMapStep, DownloadDataStep, 
+ DownloadMetadataStep, ProcessFullDataStep, SdmxStep, _run_command) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -507,7 +511,7 @@ def setUp(self) -> None: super().setUp() # Mock _run_command to avoid actual execution during pipeline tests self._run_command_patcher = mock.patch( - "tools.agentic_import.sdmx_import_pipeline._run_command") + "tools.agentic_import.sdmx_pipeline_steps._run_command") self._mock_run_command = self._run_command_patcher.start() self.addCleanup(self._run_command_patcher.stop) @@ -662,7 +666,7 @@ def _assert_run_and_dry_run_use_same_plan( extra_cmd_checks=None, expect_verbose: bool = True) -> None: extra_cmd_checks = extra_cmd_checks or [] - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + with mock.patch("tools.agentic_import.sdmx_pipeline_steps._run_command" ) as mock_run_cmd: with self.assertLogs(logging.get_absl_logger(), level="INFO") as logs: diff --git a/tools/agentic_import/sdmx_pipeline_builder.py b/tools/agentic_import/sdmx_pipeline_builder.py new file mode 100644 index 0000000000..8d8c01ee64 --- /dev/null +++ b/tools/agentic_import/sdmx_pipeline_builder.py @@ -0,0 +1,231 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Builder for the SDMX agentic import pipeline.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import ClassVar, Sequence + +from absl import logging + +from tools.agentic_import.pipeline import Pipeline, Step +from tools.agentic_import.sdmx_pipeline_config import PipelineConfig +from tools.agentic_import.sdmx_pipeline_steps import ( + CreateDcConfigStep, CreateSampleStep, CreateSchemaMapStep, DownloadDataStep, + DownloadMetadataStep, ProcessFullDataStep) +from tools.agentic_import.state_handler import PipelineState + + +@dataclass(frozen=True) +class StepDecision: + """Represents whether a step will run and why.""" + + RUN: ClassVar[str] = "RUN" + SKIP: ClassVar[str] = "SKIP" + + step_name: str + decision: str + reason: str + + +@dataclass(frozen=True) +class BuildResult: + """Output of planning that includes the pipeline and per-step decisions.""" + + pipeline: Pipeline + decisions: list[StepDecision] + + +class PipelineBuilder: + + def __init__(self, + *, + config: PipelineConfig, + state: PipelineState, + steps: Sequence[Step], + critical_input_hash: str | None = None) -> None: + self._config = config + self._state = state + self._steps = steps + self._critical_input_hash = critical_input_hash + + def build(self) -> BuildResult: + if self._config.run.run_only: + planned, decisions = self._plan_run_only(self._config.run.run_only) + elif self._config.run.force: + logging.info("Force flag set; scheduling all SDMX steps") + planned, decisions = self._plan_all_steps( + "Force flag set; scheduling this step") + elif self._hash_changed(): + logging.info("Critical inputs changed; scheduling all SDMX steps") + planned, decisions = self._plan_all_steps( + "Critical inputs changed; scheduling this step") + else: + planned, decisions = self._plan_incremental() + logging.info("Built SDMX pipeline with %d steps", len(planned)) + return BuildResult(pipeline=Pipeline(steps=planned), + decisions=decisions) + + def _plan_run_only(self, + run_only: str) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + for step in self._steps: + if step.name == run_only: + planned.append(step) + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.RUN, + reason=(f"run_only={run_only} requested; running only " + "this step"), + )) + else: + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.SKIP, + reason=(f"run_only={run_only} requested; skipping " + "this step"), + )) + if not planned: + raise ValueError(f"run_only step not found: {run_only}") + return planned, decisions + + def _plan_all_steps(self, + reason: str) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + for step in self._steps: + planned.append(step) + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.RUN, + reason=reason)) + return planned, decisions + + def _plan_incremental(self) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + schedule_all_remaining = False + previous: Step | None = None + for step in self._steps: + if schedule_all_remaining: + planned.append(step) + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.RUN, + reason=("Upstream step triggered rerun for remaining " + "steps"), + )) + previous = step + continue + + prev_state = self._state.steps.get(step.name) + if prev_state is None: + needs_run = True + reason = 
"No previous state recorded; scheduling step" + elif prev_state.status != "succeeded": + needs_run = True + reason = (f"Previous run status was {prev_state.status}; " + "rerunning step") + elif prev_state.version < step.version: + needs_run = True + reason = ( + f"Step version increased from {prev_state.version} to " + f"{step.version}; rerunning step") + else: + needs_run = False + reason = ("Previous run succeeded with same version; step is " + "up-to-date") + + if not needs_run and previous is not None: + if self._predecessor_newer(previous, step): + needs_run = True + reason = (f"Previous step {previous.name} finished more " + "recently; rerunning downstream steps") + + if needs_run: + planned.append(step) + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.RUN, + reason=reason)) + schedule_all_remaining = True + else: + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.SKIP, + reason=reason)) + previous = step + + if not planned: + logging.info("No steps scheduled.") + return planned, decisions + + def _hash_changed(self) -> bool: + if not self._critical_input_hash: + return False + previous = self._state.critical_input_hash + if not previous: + return True + return previous != self._critical_input_hash + + def _predecessor_newer(self, prev_step: Step, step: Step) -> bool: + prev_state = self._state.steps.get(prev_step.name) + curr_state = self._state.steps.get(step.name) + if prev_state is None or prev_state.ended_at_ts is None: + return False + if curr_state is None: + return True + if curr_state.status != "succeeded": + return True + if curr_state.ended_at_ts is None: + return True + return prev_state.ended_at_ts > curr_state.ended_at_ts + + +def build_steps(config: PipelineConfig) -> list[Step]: + """Constructs the hard-coded list of canonical steps.""" + return [ + DownloadDataStep(name="download-data", config=config), + DownloadMetadataStep(name="download-metadata", config=config), + CreateSampleStep(name="create-sample", config=config), + CreateSchemaMapStep(name="create-schema-mapping", config=config), + ProcessFullDataStep(name="process-full-data", config=config), + CreateDcConfigStep(name="create-dc-config", config=config), + ] + + +def _log_step_decisions(decisions: Sequence[StepDecision]) -> None: + for decision in decisions: + logging.info("step=%s decision=%s reason=%s", decision.step_name, + decision.decision, decision.reason) + + +def build_sdmx_pipeline(*, + config: PipelineConfig, + state: PipelineState, + steps: Sequence[Step] | None = None, + critical_input_hash: str | None = None) -> Pipeline: + builder_steps = steps if steps is not None else build_steps(config) + builder = PipelineBuilder(config=config, + state=state, + steps=builder_steps, + critical_input_hash=critical_input_hash) + result = builder.build() + _log_step_decisions(result.decisions) + return result.pipeline diff --git a/tools/agentic_import/sdmx_pipeline_config.py b/tools/agentic_import/sdmx_pipeline_config.py new file mode 100644 index 0000000000..dd9683518b --- /dev/null +++ b/tools/agentic_import/sdmx_pipeline_config.py @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Configuration dataclasses for the SDMX agentic import pipeline.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(frozen=True) +class SdmxDataflowConfig: + """Configuration for SDMX dataflow.""" + id: str | None = None + key: str | None = None + param: str | None = None + + +@dataclass(frozen=True) +class SdmxConfig: + """Configuration for SDMX data access.""" + endpoint: str | None = None + agency: str | None = None + dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) + + +@dataclass(frozen=True) +class SampleConfig: + """Configuration for data sampling.""" + rows: int = 1000 + + +@dataclass(frozen=True) +class RunConfig: + """Configuration for pipeline execution.""" + command: str + dataset_prefix: str | None = None + working_dir: str | None = None + run_only: str | None = None + force: bool = False + verbose: bool = False + skip_confirmation: bool = False + gemini_cli: str | None = None + + +@dataclass(frozen=True) +class PipelineConfig: + """Aggregated configuration for the pipeline.""" + sdmx: SdmxConfig = field(default_factory=SdmxConfig) + sample: SampleConfig = field(default_factory=SampleConfig) + run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) diff --git a/tools/agentic_import/sdmx_pipeline_steps.py b/tools/agentic_import/sdmx_pipeline_steps.py new file mode 100644 index 0000000000..7d25165b66 --- /dev/null +++ b/tools/agentic_import/sdmx_pipeline_steps.py @@ -0,0 +1,467 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Step implementations for the SDMX agentic import pipeline.""" + +from __future__ import annotations + +import abc +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import ClassVar, Sequence + +from absl import logging + +from tools.agentic_import.pipeline import Step +from tools.agentic_import.sdmx_pipeline_config import PipelineConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] + +SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" +DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" +STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / + "stat_var_processor.py") +PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" +DC_CONFIG_GENERATOR_PATH = (REPO_ROOT / "tools" / "agentic_import" / + "generate_custom_dc_config.py") + +SAMPLE_OUTPUT_DIR = Path("sample_output") +FINAL_OUTPUT_DIR = Path("output") + +# Flag names (copied for reference/usage in steps if needed, +# though they are mostly used in main for flag definition) +_FLAG_SDMX_ENDPOINT = "sdmx.endpoint" +_FLAG_SDMX_AGENCY = "sdmx.agency" +_FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" + + +def _require_config_field(value: str | None, field_name: str, + step_name: str) -> str: + if value: + return value + raise ValueError(f"{step_name} requires config.{field_name}") + + +def _run_command(command: Sequence[str], *, verbose: bool) -> None: + if verbose: + logging.debug(f"Running command: {' '.join(command)}") + subprocess.run(command, check=True) + + +def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: + command = [sys.executable, str(SDMX_CLI_PATH), *args] + _run_command(command, verbose=verbose) + + +class SdmxStep(Step): + """Base class for SDMX steps that carries immutable config and version.""" + + def __init__(self, *, name: str, version: int, + config: PipelineConfig) -> None: + if not name: + raise ValueError("step requires a name") + self._name = name + self._version = version + self._config = config + + @property + def name(self) -> str: + return self._name + + @property + def version(self) -> int: + return self._version + + @abc.abstractmethod + def dry_run(self) -> None: + """Log a read-only preview of the work to be done.""" + + +class DownloadDataStep(SdmxStep): + """Downloads SDMX data payloads.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + full_command: list[str] + output_path: Path + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: DownloadDataStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + output_path = working_dir / f"{dataset_prefix}_data.csv" + args = [ + "download-data", + f"--endpoint={endpoint}", + f"--agency={agency}", + f"--dataflow={dataflow}", + f"--output_path={output_path}", + ] + if self._config.sdmx.dataflow.key: + args.append(f"--key={self._config.sdmx.dataflow.key}") + if self._config.sdmx.dataflow.param: + 
args.append(f"--param={self._config.sdmx.dataflow.param}") + if self._config.run.verbose: + args.append("--verbose") + full_command = [sys.executable, str(SDMX_CLI_PATH)] + args + self._context = DownloadDataStep._StepContext(full_command=full_command, + output_path=output_path) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if self._config.run.verbose: + logging.info( + f"Starting SDMX data download: {' '.join(context.full_command)} -> {context.output_path}" + ) + else: + logging.info(f"Downloading SDMX data to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class DownloadMetadataStep(SdmxStep): + """Downloads SDMX metadata payloads.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + full_command: list[str] + output_path: Path + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: DownloadMetadataStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + output_path = working_dir / f"{dataset_prefix}_metadata.xml" + args = [ + "download-metadata", + f"--endpoint={endpoint}", + f"--agency={agency}", + f"--dataflow={dataflow}", + f"--output_path={output_path}", + ] + if self._config.run.verbose: + args.append("--verbose") + full_command = [sys.executable, str(SDMX_CLI_PATH)] + args + self._context = DownloadMetadataStep._StepContext( + full_command=full_command, output_path=output_path) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if self._config.run.verbose: + logging.info( + f"Starting SDMX metadata download: {' '.join(context.full_command)} -> {context.output_path}" + ) + else: + logging.info(f"Downloading SDMX metadata to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class CreateSampleStep(SdmxStep): + """Creates a sample dataset from downloaded data.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + input_path: Path + full_command: list[str] + output_path: Path + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateSampleStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + input_path = working_dir / f"{dataset_prefix}_data.csv" + output_path = working_dir / f"{dataset_prefix}_sample.csv" + + args = [ + f"--sampler_input={input_path}", + f"--sampler_output={output_path}", + 
f"--sampler_output_rows={self._config.sample.rows}", + ] + full_command = [sys.executable, str(DATA_SAMPLER_PATH)] + args + self._context = CreateSampleStep._StepContext(input_path=input_path, + full_command=full_command, + output_path=output_path) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if not context.input_path.is_file(): + raise RuntimeError( + f"Input file missing for sampling: {context.input_path}") + if self._config.run.verbose: + logging.info( + f"Starting data sampling: {' '.join(context.full_command)} -> {context.output_path}" + ) + else: + logging.info(f"Sampling data to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class CreateSchemaMapStep(SdmxStep): + """Builds schema mappings for transformed data.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + sample_path: Path + metadata_path: Path + output_prefix: Path + full_command: list[str] + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateSchemaMapStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + sample_path = working_dir / f"{dataset_prefix}_sample.csv" + metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" + output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix + + args = [ + f"--input_data={sample_path}", + f"--input_metadata={metadata_path}", + "--sdmx_dataset", + f"--output_path={output_prefix}", + ] + if self._config.run.skip_confirmation: + args.append("--skip_confirmation") + if self._config.run.gemini_cli: + args.append(f"--gemini_cli={self._config.run.gemini_cli}") + args.append(f"--working_dir={working_dir}") + + full_command = [sys.executable, str(PVMAP_GENERATOR_PATH)] + args + self._context = CreateSchemaMapStep._StepContext( + sample_path=sample_path, + metadata_path=metadata_path, + output_prefix=output_prefix, + full_command=full_command) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if not context.sample_path.is_file(): + raise RuntimeError(f"Sample file missing: {context.sample_path}") + if not context.metadata_path.is_file(): + raise RuntimeError( + f"Metadata file missing: {context.metadata_path}") + context.output_prefix.parent.mkdir(parents=True, exist_ok=True) + logging.info( + f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" + ) + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class ProcessFullDataStep(SdmxStep): + """Processes full SDMX data into DC artifacts.""" + + VERSION = 1 + + RUN_OUTPUT_COLUMNS: ClassVar[str] = ( + "observationDate,observationAbout,variableMeasured,value," + "observationPeriod,measurementMethod,unit,scalingFactor") + + @dataclass(frozen=True) + class _StepContext: + input_data_path: Path + pv_map_path: Path + metadata_path: Path + full_command: list[str] + output_prefix: Path + + def __init__(self, *, name: str, config: 
PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: ProcessFullDataStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + input_data_path = working_dir / f"{dataset_prefix}_data.csv" + pv_map_path = (working_dir / SAMPLE_OUTPUT_DIR / + f"{dataset_prefix}_pvmap.csv") + metadata_path = (working_dir / SAMPLE_OUTPUT_DIR / + f"{dataset_prefix}_metadata.csv") + output_prefix = working_dir / FINAL_OUTPUT_DIR / dataset_prefix + + args = [ + f"--input_data={input_data_path}", + f"--pv_map={pv_map_path}", + f"--config_file={metadata_path}", + "--generate_statvar_name=True", + "--skip_constant_csv_columns=False", + f"--output_columns={self.RUN_OUTPUT_COLUMNS}", + f"--output_path={output_prefix}", + ] + full_command = [sys.executable, str(STAT_VAR_PROCESSOR_PATH)] + args + self._context = ProcessFullDataStep._StepContext( + input_data_path=input_data_path, + pv_map_path=pv_map_path, + metadata_path=metadata_path, + full_command=full_command, + output_prefix=output_prefix, + ) + return self._context + + def run(self) -> None: + context = self._prepare_command() + for required in (context.input_data_path, context.pv_map_path, + context.metadata_path): + if not required.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {required}") + # Ensure output directory exists + context.output_prefix.parent.mkdir(parents=True, exist_ok=True) + logging.info( + f"Starting stat_var_processor: input={context.input_data_path} " + f"pvmap={context.pv_map_path} metadata={context.metadata_path} -> " + f"{context.output_prefix}") + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class CreateDcConfigStep(SdmxStep): + """Generates Datacommons configuration artifacts.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + input_csv: Path + output_config: Path + full_command: list[str] + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateDcConfigStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + input_csv = working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}.csv" + output_config = (working_dir / FINAL_OUTPUT_DIR / + f"{dataset_prefix}_config.json") + + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + + dataset_url = (f"{endpoint.rstrip('/')}/data/" + f"{agency},{dataflow},") + + args = [ + f"--input_csv={input_csv}", + f"--output_config={output_config}", + f"--provenance_name={dataflow}", + f"--source_name={agency}", + f"--data_source_url={endpoint}", + f"--dataset_url={dataset_url}", + ] + full_command = [sys.executable, str(DC_CONFIG_GENERATOR_PATH)] + args + self._context = CreateDcConfigStep._StepContext( + input_csv=input_csv, + 
output_config=output_config, + full_command=full_command) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if not context.input_csv.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {context.input_csv}") + + logging.info( + f"Starting custom DC config generation: input={context.input_csv} -> {context.output_config}" + ) + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) From b10c1be46117145c105d2f8b524288cfc1cd9646 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 09:18:20 +0000 Subject: [PATCH 51/54] refactor: centralize SDMX flag constants --- tools/agentic_import/sdmx_import_pipeline.py | 57 ++++++++++---------- tools/agentic_import/sdmx_pipeline_config.py | 7 +++ tools/agentic_import/sdmx_pipeline_steps.py | 36 ++++++------- 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index a26e6799f0..32957e6c6d 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -37,41 +37,44 @@ PipelineCallback, PipelineRunner, RunnerConfig, Step) from tools.agentic_import.sdmx_pipeline_builder import build_sdmx_pipeline -from tools.agentic_import.sdmx_pipeline_config import (PipelineConfig, - RunConfig, SampleConfig, - SdmxConfig, - SdmxDataflowConfig) +from tools.agentic_import.sdmx_pipeline_config import ( + FLAG_SDMX_AGENCY, + FLAG_SDMX_DATAFLOW_ID, + FLAG_SDMX_DATAFLOW_KEY, + FLAG_SDMX_DATAFLOW_PARAM, + FLAG_SDMX_ENDPOINT, + PipelineConfig, + RunConfig, + SampleConfig, + SdmxConfig, + SdmxDataflowConfig, +) from tools.agentic_import.sdmx_pipeline_steps import SdmxStep from tools.agentic_import.state_handler import StateHandler, StepState # Flag names -_FLAG_SDMX_ENDPOINT = "sdmx.endpoint" -_FLAG_SDMX_AGENCY = "sdmx.agency" -_FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" -_FLAG_SDMX_DATAFLOW_KEY = "sdmx.dataflow.key" -_FLAG_SDMX_DATAFLOW_PARAM = "sdmx.dataflow.param" _FLAG_SAMPLE_ROWS = "sample.rows" FLAGS = flags.FLAGS def _define_flags() -> None: - flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") - flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) + flags.DEFINE_string(FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") + flags.mark_flag_as_required(FLAG_SDMX_ENDPOINT) - flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, + flags.DEFINE_string(FLAG_SDMX_AGENCY, None, "Owning SDMX agency identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) + flags.mark_flag_as_required(FLAG_SDMX_AGENCY) - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, + flags.DEFINE_string(FLAG_SDMX_DATAFLOW_ID, None, "Target SDMX dataflow identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) + flags.mark_flag_as_required(FLAG_SDMX_DATAFLOW_ID) - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, + flags.DEFINE_string(FLAG_SDMX_DATAFLOW_KEY, None, "Optional SDMX key or filter.") flags.DEFINE_string( - _FLAG_SDMX_DATAFLOW_PARAM, None, + FLAG_SDMX_DATAFLOW_PARAM, None, "Optional SDMX parameter appended to the dataflow query.") flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, @@ -126,11 +129,11 @@ def _resolve_dataset_prefix(config: PipelineConfig) -> str: def _compute_critical_input_hash(config: PipelineConfig) -> str: payload = { - _FLAG_SDMX_AGENCY: config.sdmx.agency, - 
_FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, - _FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, - _FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, - _FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, + FLAG_SDMX_AGENCY: config.sdmx.agency, + FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, + FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, + FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, + FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, } serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) return hashlib.sha256(serialized.encode("utf-8")).hexdigest() @@ -296,12 +299,12 @@ def prepare_config() -> PipelineConfig: command = shlex.join(sys.argv) if sys.argv else "python" return PipelineConfig( sdmx=SdmxConfig( - endpoint=FLAGS[_FLAG_SDMX_ENDPOINT].value, - agency=FLAGS[_FLAG_SDMX_AGENCY].value, + endpoint=FLAGS[FLAG_SDMX_ENDPOINT].value, + agency=FLAGS[FLAG_SDMX_AGENCY].value, dataflow=SdmxDataflowConfig( - id=FLAGS[_FLAG_SDMX_DATAFLOW_ID].value, - key=FLAGS[_FLAG_SDMX_DATAFLOW_KEY].value, - param=FLAGS[_FLAG_SDMX_DATAFLOW_PARAM].value, + id=FLAGS[FLAG_SDMX_DATAFLOW_ID].value, + key=FLAGS[FLAG_SDMX_DATAFLOW_KEY].value, + param=FLAGS[FLAG_SDMX_DATAFLOW_PARAM].value, ), ), sample=SampleConfig(rows=FLAGS[_FLAG_SAMPLE_ROWS].value,), diff --git a/tools/agentic_import/sdmx_pipeline_config.py b/tools/agentic_import/sdmx_pipeline_config.py index dd9683518b..d6260eabd7 100644 --- a/tools/agentic_import/sdmx_pipeline_config.py +++ b/tools/agentic_import/sdmx_pipeline_config.py @@ -17,6 +17,13 @@ from dataclasses import dataclass, field +# SDMX flag names shared across pipeline modules. +FLAG_SDMX_ENDPOINT = "sdmx.endpoint" +FLAG_SDMX_AGENCY = "sdmx.agency" +FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" +FLAG_SDMX_DATAFLOW_KEY = "sdmx.dataflow.key" +FLAG_SDMX_DATAFLOW_PARAM = "sdmx.dataflow.param" + @dataclass(frozen=True) class SdmxDataflowConfig: diff --git a/tools/agentic_import/sdmx_pipeline_steps.py b/tools/agentic_import/sdmx_pipeline_steps.py index 7d25165b66..53597972b8 100644 --- a/tools/agentic_import/sdmx_pipeline_steps.py +++ b/tools/agentic_import/sdmx_pipeline_steps.py @@ -25,7 +25,12 @@ from absl import logging from tools.agentic_import.pipeline import Step -from tools.agentic_import.sdmx_pipeline_config import PipelineConfig +from tools.agentic_import.sdmx_pipeline_config import ( + FLAG_SDMX_AGENCY, + FLAG_SDMX_DATAFLOW_ID, + FLAG_SDMX_ENDPOINT, + PipelineConfig, +) REPO_ROOT = Path(__file__).resolve().parents[2] @@ -40,12 +45,6 @@ SAMPLE_OUTPUT_DIR = Path("sample_output") FINAL_OUTPUT_DIR = Path("output") -# Flag names (copied for reference/usage in steps if needed, -# though they are mostly used in main for flag definition) -_FLAG_SDMX_ENDPOINT = "sdmx.endpoint" -_FLAG_SDMX_AGENCY = "sdmx.agency" -_FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" - def _require_config_field(value: str | None, field_name: str, step_name: str) -> str: @@ -60,11 +59,6 @@ def _run_command(command: Sequence[str], *, verbose: bool) -> None: subprocess.run(command, check=True) -def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: - command = [sys.executable, str(SDMX_CLI_PATH), *args] - _run_command(command, verbose=verbose) - - class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -107,11 +101,11 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) + FLAG_SDMX_ENDPOINT, self.name) agency = 
_require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) + FLAG_SDMX_AGENCY, self.name) dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) + FLAG_SDMX_DATAFLOW_ID, self.name) dataset_prefix = self._config.run.dataset_prefix working_dir = Path(self._config.run.working_dir).resolve() output_path = working_dir / f"{dataset_prefix}_data.csv" @@ -168,11 +162,11 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) + FLAG_SDMX_ENDPOINT, self.name) agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) + FLAG_SDMX_AGENCY, self.name) dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) + FLAG_SDMX_DATAFLOW_ID, self.name) dataset_prefix = self._config.run.dataset_prefix working_dir = Path(self._config.run.working_dir).resolve() output_path = working_dir / f"{dataset_prefix}_metadata.xml" @@ -425,11 +419,11 @@ def _prepare_command(self) -> _StepContext: f"{dataset_prefix}_config.json") endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) + FLAG_SDMX_ENDPOINT, self.name) agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) + FLAG_SDMX_AGENCY, self.name) dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) + FLAG_SDMX_DATAFLOW_ID, self.name) dataset_url = (f"{endpoint.rstrip('/')}/data/" f"{agency},{dataflow},") From e9105a9829d9702f8846066301e71b1f46ceac13 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 8 Dec 2025 05:19:00 +0000 Subject: [PATCH 52/54] fix: support string version identifiers --- tools/agentic_import/pipeline.py | 8 +-- tools/agentic_import/pipeline_test.py | 6 +-- .../sdmx_import_pipeline_test.py | 50 +++++++++---------- tools/agentic_import/sdmx_pipeline_builder.py | 7 ++- tools/agentic_import/sdmx_pipeline_steps.py | 16 +++--- tools/agentic_import/state_handler.py | 2 +- 6 files changed, 44 insertions(+), 45 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index 43d009c3b6..d33abb8534 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -37,8 +37,8 @@ def name(self) -> str: @property @abc.abstractmethod - def version(self) -> int: - """Version used for invalidation decisions.""" + def version(self) -> str: + """Version string used for invalidation decisions.""" @abc.abstractmethod def run(self) -> None: @@ -48,7 +48,7 @@ def run(self) -> None: class BaseStep(Step, abc.ABC): """Helper base class that stores mandatory metadata.""" - def __init__(self, *, name: str, version: int) -> None: + def __init__(self, *, name: str, version: str) -> None: if not name: raise ValueError("step requires a name") self._name = name @@ -59,7 +59,7 @@ def name(self) -> str: return self._name @property - def version(self) -> int: + def version(self) -> str: return self._version diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 52944546f1..2abfc4c45c 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -31,7 +31,7 @@ class _TrackingStep(BaseStep): def __init__(self, name: str, events: list[str]) -> None: - super().__init__(name=name, version=1) + super().__init__(name=name, version="1") self._events = events self.executed = False 
@@ -42,7 +42,7 @@ def run(self) -> None: class _FailingStep(BaseStep): - def __init__(self, *, name: str, version: int) -> None: + def __init__(self, *, name: str, version: str) -> None: super().__init__(name=name, version=version) def run(self) -> None: @@ -132,7 +132,7 @@ def after_step(self, self.after_calls.append((name, error_name)) callback = RecordingCallback() - pipeline = Pipeline(steps=[_FailingStep(name="fail-step", version=1)]) + pipeline = Pipeline(steps=[_FailingStep(name="fail-step", version="1")]) with self.assertRaises(ValueError): PipelineRunner(RunnerConfig()).run(pipeline, callback) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 88b86b5721..367ee0364f 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -77,7 +77,7 @@ def __call__(self) -> datetime: class _RecordingStep(SdmxStep): def __init__(self, name: str, *, should_fail: bool = False) -> None: - super().__init__(name=name, version=1, config=_TEST_CONFIG) + super().__init__(name=name, version="1", config=_TEST_CONFIG) self._should_fail = should_fail def run(self) -> None: @@ -90,7 +90,7 @@ def dry_run(self) -> None: class _VersionedStep(SdmxStep): - def __init__(self, name: str, version: int) -> None: + def __init__(self, name: str, version: str) -> None: super().__init__(name=name, version=version, config=_TEST_CONFIG) def run(self) -> None: @@ -188,7 +188,7 @@ def test_abort_skips_state_persistence(self) -> None: "updated_at_ts": 1, "steps": { "existing.step": { - "version": 1, + "version": "1", "status": "succeeded", "started_at": "2025-01-01T00:00:00Z", "started_at_ts": 0, @@ -207,7 +207,7 @@ class _AbortStep(SdmxStep): def __init__(self) -> None: super().__init__(name="download.download-data", - version=1, + version="1", config=_TEST_CONFIG) def run(self) -> None: @@ -314,7 +314,7 @@ def _empty_state(self) -> PipelineState: steps={}) def _state_with( - self, versions: dict[str, tuple[int, str, + self, versions: dict[str, tuple[str, str, int | None]]) -> PipelineState: steps = { name: @@ -379,12 +379,12 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: newer = 2_000 older = 1_000 state = self._state_with({ - "download-data": (1, "succeeded", newer), - "download-metadata": (1, "succeeded", older), - "create-sample": (1, "succeeded", older), - "create-schema-mapping": (1, "succeeded", older), - "process-full-data": (1, "succeeded", older), - "create-dc-config": (1, "succeeded", older), + "download-data": ("1", "succeeded", newer), + "download-metadata": ("1", "succeeded", older), + "create-sample": ("1", "succeeded", older), + "create-schema-mapping": ("1", "succeeded", older), + "process-full-data": ("1", "succeeded", older), + "create-dc-config": ("1", "succeeded", older), }) cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) names = self._names_from_builder(cfg, state=state) @@ -412,8 +412,8 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: newer = 4_000 older = 3_000 state = self._state_with({ - "download-data": (1, "succeeded", newer), - "download-metadata": (1, "succeeded", older), + "download-data": ("1", "succeeded", newer), + "download-metadata": ("1", "succeeded", older), }) cfg = PipelineConfig( run=RunConfig(command=_TEST_COMMAND, run_only="download-data")) @@ -422,14 +422,14 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: def test_version_bump_schedules_downstream(self) -> None: steps = [ - 
_VersionedStep("download-data", 1), - _VersionedStep("process-full-data", 2), - _VersionedStep("create-dc-config", 1), + _VersionedStep("download-data", "1"), + _VersionedStep("process-full-data", "2"), + _VersionedStep("create-dc-config", "1"), ] state = self._state_with({ - "download-data": (1, "succeeded", 1000), - "process-full-data": (1, "succeeded", 1000), - "create-dc-config": (1, "succeeded", 1000), + "download-data": ("1", "succeeded", 1000), + "process-full-data": ("1", "succeeded", 1000), + "create-dc-config": ("1", "succeeded", 1000), }) cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) names = self._names_from_builder(cfg, steps, state) @@ -441,12 +441,12 @@ def test_version_bump_schedules_downstream(self) -> None: def test_incremental_records_skip_reasons(self) -> None: state = self._state_with({ - "download-data": (1, "succeeded", 1_000), - "download-metadata": (1, "succeeded", 1_000), - "create-sample": (1, "succeeded", 1_000), - "create-schema-mapping": (1, "succeeded", 1_000), - "process-full-data": (1, "succeeded", 1_000), - "create-dc-config": (1, "succeeded", 1_000), + "download-data": ("1", "succeeded", 1_000), + "download-metadata": ("1", "succeeded", 1_000), + "create-sample": ("1", "succeeded", 1_000), + "create-schema-mapping": ("1", "succeeded", 1_000), + "process-full-data": ("1", "succeeded", 1_000), + "create-dc-config": ("1", "succeeded", 1_000), }) cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) steps = build_steps(cfg) diff --git a/tools/agentic_import/sdmx_pipeline_builder.py b/tools/agentic_import/sdmx_pipeline_builder.py index 8d8c01ee64..e7588577e5 100644 --- a/tools/agentic_import/sdmx_pipeline_builder.py +++ b/tools/agentic_import/sdmx_pipeline_builder.py @@ -142,11 +142,10 @@ def _plan_incremental(self) -> tuple[list[Step], list[StepDecision]]: needs_run = True reason = (f"Previous run status was {prev_state.status}; " "rerunning step") - elif prev_state.version < step.version: + elif prev_state.version != step.version: needs_run = True - reason = ( - f"Step version increased from {prev_state.version} to " - f"{step.version}; rerunning step") + reason = (f"Step version changed from {prev_state.version} to " + f"{step.version}; rerunning step") else: needs_run = False reason = ("Previous run succeeded with same version; step is " diff --git a/tools/agentic_import/sdmx_pipeline_steps.py b/tools/agentic_import/sdmx_pipeline_steps.py index 53597972b8..9455a811c3 100644 --- a/tools/agentic_import/sdmx_pipeline_steps.py +++ b/tools/agentic_import/sdmx_pipeline_steps.py @@ -62,7 +62,7 @@ def _run_command(command: Sequence[str], *, verbose: bool) -> None: class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" - def __init__(self, *, name: str, version: int, + def __init__(self, *, name: str, version: str, config: PipelineConfig) -> None: if not name: raise ValueError("step requires a name") @@ -75,7 +75,7 @@ def name(self) -> str: return self._name @property - def version(self) -> int: + def version(self) -> str: return self._version @abc.abstractmethod @@ -86,7 +86,7 @@ def dry_run(self) -> None: class DownloadDataStep(SdmxStep): """Downloads SDMX data payloads.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: @@ -147,7 +147,7 @@ def dry_run(self) -> None: class DownloadMetadataStep(SdmxStep): """Downloads SDMX metadata payloads.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: @@ -204,7 +204,7 @@ def dry_run(self) -> None: class 
CreateSampleStep(SdmxStep): """Creates a sample dataset from downloaded data.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: @@ -258,7 +258,7 @@ def dry_run(self) -> None: class CreateSchemaMapStep(SdmxStep): """Builds schema mappings for transformed data.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: @@ -323,7 +323,7 @@ def dry_run(self) -> None: class ProcessFullDataStep(SdmxStep): """Processes full SDMX data into DC artifacts.""" - VERSION = 1 + VERSION = "1" RUN_OUTPUT_COLUMNS: ClassVar[str] = ( "observationDate,observationAbout,variableMeasured,value," @@ -397,7 +397,7 @@ def dry_run(self) -> None: class CreateDcConfigStep(SdmxStep): """Generates Datacommons configuration artifacts.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: diff --git a/tools/agentic_import/state_handler.py b/tools/agentic_import/state_handler.py index 31dabccc1f..ea1d593197 100644 --- a/tools/agentic_import/state_handler.py +++ b/tools/agentic_import/state_handler.py @@ -32,7 +32,7 @@ @dataclass_json @dataclass class StepState: - version: int + version: str status: str started_at: str ended_at: str From a2b81bdecb1211fc1d53cfcc91837daad9a5ebcb Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 8 Dec 2025 05:43:01 +0000 Subject: [PATCH 53/54] removed del --- tools/agentic_import/pipeline.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index d33abb8534..bbccc60862 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -73,11 +73,9 @@ class PipelineCallback: def before_step(self, step: Step) -> None: """Called immediately before `step.run()`; raising an error skips execution.""" - del step def after_step(self, step: Step, *, error: Exception | None = None) -> None: """Runs once per step after `step.run()` succeeds or raises.""" - del step, error class CompositeCallback(PipelineCallback): From 17f43bfaaeca4c8e0507e4bf703872db6e0461ab Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 17 Dec 2025 09:14:55 +0000 Subject: [PATCH 54/54] add comment --- tools/agentic_import/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index bbccc60862..57b234da98 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -42,7 +42,7 @@ def version(self) -> str: @abc.abstractmethod def run(self) -> None: - """Execute the step.""" + """Execute the step. Raise an exception to signal failure.""" class BaseStep(Step, abc.ABC):
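
For reference, a minimal sketch of how the runner API looks after these patches: string step versions (patch 52), optional no-op callback hooks (patch 53), and steps that signal failure by raising (patch 54). The EchoStep and LoggingCallback classes below are invented for illustration and do not exist in the repository; the sketch also assumes the repo root is on PYTHONPATH and the Phase 0 Step interface (dry_run returning a preview string).

    # Illustrative only; not part of the patch series.
    from tools.agentic_import.pipeline import (BaseStep, Pipeline,
                                                PipelineCallback,
                                                PipelineRunner, RunnerConfig,
                                                Step)


    class EchoStep(BaseStep):
        """Hypothetical step that only reports that it ran."""

        def __init__(self, name: str) -> None:
            # Versions are plain strings after patch 52.
            super().__init__(name=name, version="1")

        def run(self) -> None:
            # Raise any exception here to signal failure (patch 54).
            print(f"running {self.name}")

        def dry_run(self) -> str:
            return f"would run {self.name}"


    class LoggingCallback(PipelineCallback):
        """Hypothetical callback; overriding hooks is optional since the
        base-class hooks are no-ops after patch 53."""

        def before_step(self, step: Step) -> None:
            print(f"before {step.name} (v{step.version})")

        def after_step(self,
                       step: Step,
                       *,
                       error: Exception | None = None) -> None:
            print(f"after {step.name}: {'ok' if error is None else error}")


    if __name__ == "__main__":
        pipeline = Pipeline(steps=[EchoStep("echo-one"), EchoStep("echo-two")])
        PipelineRunner(RunnerConfig()).run(pipeline, LoggingCallback())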