From 780074e889857fdc07a163880074db0d49a88521 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 10 Nov 2025 16:11:12 +0000 Subject: [PATCH 01/54] Add agentic pipeline runner scaffold Include callbacks, abort handling, and logging Cover runner lifecycle with new unit tests --- tools/agentic_import/pipeline.py | 127 ++++++++++++++++++++++++++ tools/agentic_import/pipeline_test.py | 77 ++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 tools/agentic_import/pipeline.py create mode 100644 tools/agentic_import/pipeline_test.py diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py new file mode 100644 index 0000000000..ecd9211331 --- /dev/null +++ b/tools/agentic_import/pipeline.py @@ -0,0 +1,127 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Generic building blocks for lightweight agentic pipelines. +""" + +from __future__ import annotations + +import abc +from dataclasses import dataclass +from typing import Sequence + +from absl import logging + + +class PipelineAbort(Exception): + """Raised when pipeline execution should stop early for any reason.""" + + +class Step(abc.ABC): + """Abstract pipeline step interface.""" + + @property + @abc.abstractmethod + def name(self) -> str: + """Human friendly identifier used for logging.""" + + @property + @abc.abstractmethod + def version(self) -> int: + """Version used for invalidation decisions.""" + + @abc.abstractmethod + def run(self) -> None: + """Execute the step.""" + + @abc.abstractmethod + def dry_run(self) -> str: + """Return a read-only preview of the work to be done.""" + + +class BaseStep(Step, abc.ABC): + """Helper base class that stores mandatory metadata.""" + + def __init__(self, *, name: str, version: int) -> None: + if not name: + raise ValueError("step requires a name") + self._name = name + self._version = version + + @property + def name(self) -> str: + return self._name + + @property + def version(self) -> int: + return self._version + + +@dataclass(frozen=True) +class Pipeline: + steps: Sequence[Step] + + def get_steps(self) -> list[Step]: + return list(self.steps) + + +class PipelineCallback: + """Lifecycle hooks consumed by the runner; defaults are no-ops.""" + + def before_step(self, step: Step) -> None: + del step + + def after_step(self, step: Step, *, error: Exception | None = None) -> None: + del step, error + + +@dataclass(frozen=True) +class RunnerConfig: + """Placeholder for future runner toggles.""" + + +class PipelineRunner: + + def __init__(self, config: RunnerConfig | None = None) -> None: + self._config = config or RunnerConfig() + + def run(self, + pipeline: Pipeline, + callback: PipelineCallback | None = None) -> None: + current_step: Step | None = None + steps = pipeline.get_steps() + logging.info("Starting pipeline with %d steps", len(steps)) + try: + for step in steps: + current_step = step + logging.info("Preparing step %s (v%d)", step.name, step.version) + if callback: + callback.before_step(step) + try: + step.run() + 
except PipelineAbort as exc: + if callback: + callback.after_step(step, error=exc) + raise + except Exception as exc: # pylint: disable=broad-except + if callback: + callback.after_step(step, error=exc) + logging.exception("Step %s failed", step.name) + raise + if callback: + callback.after_step(step) + logging.info("Finished step %s", step.name) + logging.info("Pipeline completed") + except PipelineAbort as exc: + name = current_step.name if current_step else "" + logging.info("Pipeline aborted at %s", name) diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py new file mode 100644 index 0000000000..1bd66aee15 --- /dev/null +++ b/tools/agentic_import/pipeline_test.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +"""Unit tests for the Phase 0 pipeline skeleton.""" + +import os +import sys +import unittest + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) + +from pipeline import ( # pylint: disable=import-error + BaseStep, Pipeline, PipelineAbort, PipelineCallback, PipelineRunner, + RunnerConfig, Step, +) + + +class _TrackingStep(BaseStep): + + def __init__(self, name: str, events: list[str]) -> None: + super().__init__(name=name, version=1) + self._events = events + self.executed = False + + def run(self) -> None: + self.executed = True + self._events.append(f"run:{self.name}") + + def dry_run(self) -> str: + return "" + + +class PipelineRunnerTest(unittest.TestCase): + + def _build_pipeline(self, events: list[str]) -> Pipeline: + step_one = _TrackingStep("one", events) + step_two = _TrackingStep("two", events) + return Pipeline(steps=[step_one, step_two]) + + def test_on_before_step_runs_before_each_step(self) -> None: + events: list[str] = [] + + class RecordingCallback(PipelineCallback): + + def before_step(self, step: Step) -> None: + events.append(f"before:{step.name}") + + pipeline = self._build_pipeline(events) + PipelineRunner(RunnerConfig()).run(pipeline, RecordingCallback()) + + self.assertEqual( + events, + [ + "before:one", + "run:one", + "before:two", + "run:two", + ], + ) + + def test_pipeline_abort_skips_downstream_steps(self) -> None: + events: list[str] = [] + pipeline = self._build_pipeline(events) + runner = PipelineRunner(RunnerConfig()) + + class AbortOnSecond(PipelineCallback): + + def before_step(self, step: Step) -> None: + if step.name == "two": + raise PipelineAbort("stop") + + runner.run(pipeline, AbortOnSecond()) + + self.assertEqual(events, ["run:one"]) + + +if __name__ == "__main__": + unittest.main() From c125d782f4020a10081a6bdc96fdc17bb8393322 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 10 Nov 2025 17:06:02 +0000 Subject: [PATCH 02/54] Add SDMX pipeline state callback and tests Capture step timing and errors in JSON state files --- tools/agentic_import/pipeline_test.py | 15 ++ tools/agentic_import/sdmx_import_pipeline.py | 112 ++++++++++++++ .../sdmx_import_pipeline_test.py | 137 ++++++++++++++++++ 3 files changed, 264 insertions(+) create mode 100644 tools/agentic_import/sdmx_import_pipeline.py create mode 100644 tools/agentic_import/sdmx_import_pipeline_test.py diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 1bd66aee15..6ba51fcbcb 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -1,4 +1,19 @@ #!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Unit tests for the Phase 0 pipeline skeleton.""" import os diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py new file mode 100644 index 0000000000..0107ce381f --- /dev/null +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -0,0 +1,112 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers for the SDMX agentic import pipeline.""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Callable + +from absl import logging + +from tools.agentic_import.pipeline import PipelineCallback, Step + + +def _format_time(value: datetime) -> str: + if value.tzinfo is None: + value = value.replace(tzinfo=timezone.utc) + return value.isoformat() + + +@dataclass +class StepState: + version: int + status: str + started_at: str + ended_at: str + duration_s: float + message: str | None = None + + +@dataclass +class PipelineState: + run_id: str + critical_input_hash: str + command: str + updated_at: str + steps: dict[str, StepState] = field(default_factory=dict) + + +class JSONStateCallback(PipelineCallback): + """Persists pipeline progress to the SDMX state file. + + The callback is intentionally unaware of planning concerns. The CLI computes + identifiers such as run_id and critical_input_hash before invoking the + runner, then instantiates this callback with the desired destination file. 
+ """ + + def __init__(self, + *, + state_path: str | Path, + run_id: str, + critical_input_hash: str, + command: str, + now_fn: Callable[[], datetime] | None = None) -> None: + self._state_path = Path(state_path) + self._now_fn = now_fn or (lambda: datetime.now(timezone.utc)) + self._state = PipelineState( + run_id=run_id, + critical_input_hash=critical_input_hash, + command=command, + updated_at=_format_time(self._now()), + ) + self._step_start_times: dict[str, datetime] = {} + self._state_path.parent.mkdir(parents=True, exist_ok=True) + logging.info("JSON state will be written to %s", self._state_path) + + def before_step(self, step: Step) -> None: + started_at = self._now() + self._step_start_times[step.name] = started_at + + def after_step(self, step: Step, *, error: Exception | None = None) -> None: + ended_at = self._now() + started_at = self._step_start_times.pop(step.name, None) + if started_at is None: + started_at = ended_at + duration = max(0.0, (ended_at - started_at).total_seconds()) + step_state = StepState( + version=step.version, + status="failed" if error else "succeeded", + started_at=_format_time(started_at), + ended_at=_format_time(ended_at), + duration_s=duration, + message=str(error) or error.__class__.__name__ if error else None, + ) + self._state.steps[step.name] = step_state + self._state.updated_at = step_state.ended_at + self._write_state() + + def _now(self) -> datetime: + return self._now_fn() + + def _write_state(self) -> None: + temp_path = self._state_path.with_suffix(self._state_path.suffix + ".tmp") + with temp_path.open("w", encoding="utf-8") as fp: + json.dump(asdict(self._state), fp, indent=2, sort_keys=True) + fp.write("\n") + temp_path.replace(self._state_path) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py new file mode 100644 index 0000000000..792aac3f01 --- /dev/null +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for the Phase 1 JSON state callback.""" + +from __future__ import annotations + +import json +import os +import sys +import tempfile +import unittest +from datetime import datetime, timedelta, timezone + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_REPO_ROOT = os.path.dirname(_SCRIPT_DIR) +_PROJECT_ROOT = os.path.dirname(_REPO_ROOT) +for path in (_PROJECT_ROOT,): + if path not in sys.path: + sys.path.append(path) + +from tools.agentic_import.pipeline import ( # pylint: disable=import-error + BaseStep, + Pipeline, + PipelineRunner, + RunnerConfig, +) +from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error + JSONStateCallback, +) + + +class _IncrementingClock: + + def __init__(self, start: datetime, step: timedelta) -> None: + self._value = start + self._step = step + self._first_call = True + + def __call__(self) -> datetime: + if self._first_call: + self._first_call = False + return self._value + self._value = self._value + self._step + return self._value + + +class _RecordingStep(BaseStep): + + def __init__(self, name: str, *, should_fail: bool = False) -> None: + super().__init__(name=name, version=1) + self._should_fail = should_fail + + def run(self) -> None: + if self._should_fail: + raise ValueError("boom") + + def dry_run(self) -> str: + return "noop" + + +class JSONStateCallbackTest(unittest.TestCase): + + def _build_callback(self, *, tmpdir: str, + clock: _IncrementingClock) -> JSONStateCallback: + state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") + return JSONStateCallback( + state_path=state_path, + run_id="demo", + critical_input_hash="abc123", + command="python run", + now_fn=clock, + ) + + def test_successful_step_persists_expected_schema(self) -> None: + clock = _IncrementingClock( + datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc), + timedelta(seconds=5)) + with tempfile.TemporaryDirectory() as tmpdir: + callback = self._build_callback(tmpdir=tmpdir, clock=clock) + pipeline = Pipeline(steps=[_RecordingStep("download.download-data")]) + runner = PipelineRunner(RunnerConfig()) + runner.run(pipeline, callback) + + state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") + with open(state_path, encoding="utf-8") as fp: + state = json.load(fp) + + step_state = state["steps"]["download.download-data"] + self.assertEqual(state["run_id"], "demo") + self.assertEqual(state["critical_input_hash"], "abc123") + self.assertEqual(step_state["status"], "succeeded") + self.assertIn("started_at", step_state) + self.assertIn("ended_at", step_state) + self.assertAlmostEqual(step_state["duration_s"], 5.0) + self.assertIn("message", step_state) + self.assertIsNone(step_state["message"]) + self.assertEqual(state["updated_at"], step_state["ended_at"]) + + def test_failed_step_records_error_and_persists_file(self) -> None: + clock = _IncrementingClock( + datetime(2025, 1, 2, 0, 0, tzinfo=timezone.utc), + timedelta(seconds=7)) + with tempfile.TemporaryDirectory() as tmpdir: + callback = self._build_callback(tmpdir=tmpdir, clock=clock) + pipeline = Pipeline( + steps=[_RecordingStep("sample.create-sample", should_fail=True)]) + runner = PipelineRunner(RunnerConfig()) + + with self.assertRaisesRegex(ValueError, "boom"): + runner.run(pipeline, callback) + + state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") + with open(state_path, encoding="utf-8") as fp: + state = json.load(fp) + + step_state = state["steps"]["sample.create-sample"] + self.assertEqual(step_state["status"], "failed") + 
self.assertIn("boom", step_state["message"]) + self.assertAlmostEqual(step_state["duration_s"], 7.0) + + +if __name__ == "__main__": + unittest.main() From 5271fd3837f4e79b3993cacbcb72b514d9d368a4 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 10 Nov 2025 17:07:59 +0000 Subject: [PATCH 03/54] Polish agentic import docstrings and format Align module documentation and line wrapping with current style --- tools/agentic_import/pipeline_test.py | 3 +-- tools/agentic_import/sdmx_import_pipeline.py | 4 ++-- .../sdmx_import_pipeline_test.py | 19 ++++++++----------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 6ba51fcbcb..b68864d1fa 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -"""Unit tests for the Phase 0 pipeline skeleton.""" +"""Unit tests for the agentic pipeline skeleton.""" import os import sys diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 0107ce381f..c96b8d6f72 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """Helpers for the SDMX agentic import pipeline.""" from __future__ import annotations @@ -105,7 +104,8 @@ def _now(self) -> datetime: return self._now_fn() def _write_state(self) -> None: - temp_path = self._state_path.with_suffix(self._state_path.suffix + ".tmp") + temp_path = self._state_path.with_suffix(self._state_path.suffix + + ".tmp") with temp_path.open("w", encoding="utf-8") as fp: json.dump(asdict(self._state), fp, indent=2, sort_keys=True) fp.write("\n") diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 792aac3f01..a9e94e5b0d 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -"""Unit tests for the Phase 1 JSON state callback.""" +"""Unit tests for SDMX pipeline helpers.""" from __future__ import annotations @@ -33,14 +32,10 @@ sys.path.append(path) from tools.agentic_import.pipeline import ( # pylint: disable=import-error - BaseStep, - Pipeline, - PipelineRunner, - RunnerConfig, + BaseStep, Pipeline, PipelineRunner, RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - JSONStateCallback, -) + JSONStateCallback,) class _IncrementingClock: @@ -91,7 +86,8 @@ def test_successful_step_persists_expected_schema(self) -> None: timedelta(seconds=5)) with tempfile.TemporaryDirectory() as tmpdir: callback = self._build_callback(tmpdir=tmpdir, clock=clock) - pipeline = Pipeline(steps=[_RecordingStep("download.download-data")]) + pipeline = Pipeline( + steps=[_RecordingStep("download.download-data")]) runner = PipelineRunner(RunnerConfig()) runner.run(pipeline, callback) @@ -116,8 +112,9 @@ def test_failed_step_records_error_and_persists_file(self) -> None: timedelta(seconds=7)) with tempfile.TemporaryDirectory() as tmpdir: callback = self._build_callback(tmpdir=tmpdir, clock=clock) - pipeline = Pipeline( - steps=[_RecordingStep("sample.create-sample", should_fail=True)]) + pipeline = Pipeline(steps=[ + _RecordingStep("sample.create-sample", should_fail=True) + ]) runner = PipelineRunner(RunnerConfig()) with self.assertRaisesRegex(ValueError, "boom"): From d6441f669e0a16305eaecc6d3259b3aa60d50fe6 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 06:02:25 +0000 Subject: [PATCH 04/54] Improve agentic import runner observability Add state handler persistence and dataclasses-json dependency --- requirements_all.txt | 1 + tools/agentic_import/pipeline.py | 27 ++-- tools/agentic_import/pipeline_test.py | 59 +++++++++ tools/agentic_import/sdmx_import_pipeline.py | 73 ++++------ .../sdmx_import_pipeline_test.py | 72 ++++++++-- tools/agentic_import/state_handler.py | 125 ++++++++++++++++++ tools/agentic_import/state_handler_test.py | 73 ++++++++++ 7 files changed, 358 insertions(+), 72 deletions(-) create mode 100644 tools/agentic_import/state_handler.py create mode 100644 tools/agentic_import/state_handler_test.py diff --git a/requirements_all.txt b/requirements_all.txt index 9ce4e5a975..6983f07837 100644 --- a/requirements_all.txt +++ b/requirements_all.txt @@ -9,6 +9,7 @@ absl-py chembl-webresource-client +dataclasses-json deepdiff earthengine-api flask_restful diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index ecd9211331..cad62d89c0 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -79,9 +79,11 @@ class PipelineCallback: """Lifecycle hooks consumed by the runner; defaults are no-ops.""" def before_step(self, step: Step) -> None: + """Called immediately before `step.run()`; raising an error skips execution.""" del step def after_step(self, step: Step, *, error: Exception | None = None) -> None: + """Runs once per step after `step.run()` succeeds or raises.""" del step, error @@ -100,28 +102,25 @@ def run(self, callback: PipelineCallback | None = None) -> None: current_step: Step | None = None steps = pipeline.get_steps() - logging.info("Starting pipeline with %d steps", len(steps)) + logging.info(f"Starting pipeline with {len(steps)} steps") try: for step in steps: current_step = step - logging.info("Preparing step %s (v%d)", step.name, step.version) + logging.info(f"Preparing step {step.name} (v{step.version})") if callback: 
callback.before_step(step) + error: Exception | None = None try: step.run() - except PipelineAbort as exc: - if callback: - callback.after_step(step, error=exc) - raise except Exception as exc: # pylint: disable=broad-except - if callback: - callback.after_step(step, error=exc) - logging.exception("Step %s failed", step.name) + error = exc + logging.exception(f"Step {step.name} failed") raise - if callback: - callback.after_step(step) - logging.info("Finished step %s", step.name) + finally: + if callback: + callback.after_step(step, error=error) + logging.info(f"Finished step {step.name}") logging.info("Pipeline completed") - except PipelineAbort as exc: + except PipelineAbort: name = current_step.name if current_step else "" - logging.info("Pipeline aborted at %s", name) + logging.info(f"Pipeline aborted at {name}") diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index b68864d1fa..5b5ccfab07 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -43,6 +43,18 @@ def dry_run(self) -> str: return "" +class _FailingStep(BaseStep): + + def __init__(self, *, name: str, version: int) -> None: + super().__init__(name=name, version=version) + + def run(self) -> None: + raise ValueError("boom") + + def dry_run(self) -> str: + return "" + + class PipelineRunnerTest(unittest.TestCase): def _build_pipeline(self, events: list[str]) -> Pipeline: @@ -85,6 +97,53 @@ def before_step(self, step: Step) -> None: runner.run(pipeline, AbortOnSecond()) self.assertEqual(events, ["run:one"]) + # PipelineAbort is swallowed by the runner, so execution simply stops. + + def test_before_step_exception_skips_after_step(self) -> None: + events: list[str] = [] + pipeline = Pipeline(steps=[_TrackingStep("one", events)]) + runner = PipelineRunner(RunnerConfig()) + + class RecordingCallback(PipelineCallback): + + def before_step(self, step: Step) -> None: + events.append(f"before:{step.name}") + raise RuntimeError("boom") + + def after_step(self, + step: Step, + *, + error: Exception | None = None) -> None: + del step, error + events.append("after-called") + + with self.assertRaises(RuntimeError): + runner.run(pipeline, RecordingCallback()) + + self.assertEqual(events, ["before:one"]) + + def test_after_step_receives_error_when_step_fails(self) -> None: + + class RecordingCallback(PipelineCallback): + + def __init__(self) -> None: + self.after_calls: list[tuple[str, str | None]] = [] + + def after_step(self, + step: Step, + *, + error: Exception | None = None) -> None: + name = step.name + error_name = type(error).__name__ if error else None + self.after_calls.append((name, error_name)) + + callback = RecordingCallback() + pipeline = Pipeline(steps=[_FailingStep(name="fail-step", version=1)]) + + with self.assertRaises(ValueError): + PipelineRunner(RunnerConfig()).run(pipeline, callback) + + self.assertEqual(callback.after_calls, [("fail-step", "ValueError")]) if __name__ == "__main__": diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index c96b8d6f72..e49a8e19cb 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,15 +15,13 @@ from __future__ import annotations -import json -from dataclasses import asdict, dataclass, field from datetime import datetime, timezone -from pathlib import Path from typing import Callable from absl import logging -from tools.agentic_import.pipeline import PipelineCallback, Step +from 
tools.agentic_import.pipeline import PipelineAbort, PipelineCallback, Step +from tools.agentic_import.state_handler import StateHandler, StepState def _format_time(value: datetime) -> str: @@ -32,51 +30,29 @@ def _format_time(value: datetime) -> str: return value.isoformat() -@dataclass -class StepState: - version: int - status: str - started_at: str - ended_at: str - duration_s: float - message: str | None = None - - -@dataclass -class PipelineState: - run_id: str - critical_input_hash: str - command: str - updated_at: str - steps: dict[str, StepState] = field(default_factory=dict) - - class JSONStateCallback(PipelineCallback): - """Persists pipeline progress to the SDMX state file. + """Persists pipeline progress to the SDMX state file via StateHandler. - The callback is intentionally unaware of planning concerns. The CLI computes - identifiers such as run_id and critical_input_hash before invoking the - runner, then instantiates this callback with the desired destination file. + This callback assumes a single process owns the state file for the lifetime + of the run. The CLI or builder sets run metadata up-front; this class only + mutates state after a step executes. """ def __init__(self, *, - state_path: str | Path, + state_handler: StateHandler, run_id: str, critical_input_hash: str, command: str, now_fn: Callable[[], datetime] | None = None) -> None: - self._state_path = Path(state_path) + self._handler = state_handler self._now_fn = now_fn or (lambda: datetime.now(timezone.utc)) - self._state = PipelineState( - run_id=run_id, - critical_input_hash=critical_input_hash, - command=command, - updated_at=_format_time(self._now()), - ) + self._state = self._handler.get_state() + self._state.run_id = run_id + self._state.critical_input_hash = critical_input_hash + self._state.command = command self._step_start_times: dict[str, datetime] = {} - self._state_path.parent.mkdir(parents=True, exist_ok=True) - logging.info("JSON state will be written to %s", self._state_path) + logging.info(f"JSON state will be written to {self._handler.path}") def before_step(self, step: Step) -> None: started_at = self._now() @@ -88,25 +64,28 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: if started_at is None: started_at = ended_at duration = max(0.0, (ended_at - started_at).total_seconds()) + if isinstance(error, PipelineAbort): + logging.info( + f"Skipping state update for {step.name} due to pipeline abort") + return + if error: + message = str(error) or error.__class__.__name__ + else: + message = None + # Step stats are persisted only after the step finishes; steps can still + # be skipped after their before_step callback runs, so we leave skipped + # steps untouched to preserve prior state. 
step_state = StepState( version=step.version, status="failed" if error else "succeeded", started_at=_format_time(started_at), ended_at=_format_time(ended_at), duration_s=duration, - message=str(error) or error.__class__.__name__ if error else None, + message=message, ) self._state.steps[step.name] = step_state self._state.updated_at = step_state.ended_at - self._write_state() + self._handler.save_state() def _now(self) -> datetime: return self._now_fn() - - def _write_state(self) -> None: - temp_path = self._state_path.with_suffix(self._state_path.suffix + - ".tmp") - with temp_path.open("w", encoding="utf-8") as fp: - json.dump(asdict(self._state), fp, indent=2, sort_keys=True) - fp.write("\n") - temp_path.replace(self._state_path) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index a9e94e5b0d..259db3e979 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -32,10 +32,11 @@ sys.path.append(path) from tools.agentic_import.pipeline import ( # pylint: disable=import-error - BaseStep, Pipeline, PipelineRunner, RunnerConfig, + BaseStep, Pipeline, PipelineAbort, PipelineRunner, RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error JSONStateCallback,) +from tools.agentic_import.state_handler import StateHandler # pylint: disable=import-error class _IncrementingClock: @@ -69,30 +70,32 @@ def dry_run(self) -> str: class JSONStateCallbackTest(unittest.TestCase): - def _build_callback(self, *, tmpdir: str, - clock: _IncrementingClock) -> JSONStateCallback: + def _build_callback( + self, *, tmpdir: str, clock: _IncrementingClock + ) -> tuple[JSONStateCallback, StateHandler]: state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") - return JSONStateCallback( - state_path=state_path, + handler = StateHandler(state_path=state_path, dataset_prefix="demo") + callback = JSONStateCallback( + state_handler=handler, run_id="demo", critical_input_hash="abc123", command="python run", now_fn=clock, ) + return callback, handler def test_successful_step_persists_expected_schema(self) -> None: clock = _IncrementingClock( datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc), timedelta(seconds=5)) with tempfile.TemporaryDirectory() as tmpdir: - callback = self._build_callback(tmpdir=tmpdir, clock=clock) + callback, handler = self._build_callback(tmpdir=tmpdir, clock=clock) pipeline = Pipeline( steps=[_RecordingStep("download.download-data")]) runner = PipelineRunner(RunnerConfig()) runner.run(pipeline, callback) - state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") - with open(state_path, encoding="utf-8") as fp: + with open(handler.path, encoding="utf-8") as fp: state = json.load(fp) step_state = state["steps"]["download.download-data"] @@ -111,7 +114,7 @@ def test_failed_step_records_error_and_persists_file(self) -> None: datetime(2025, 1, 2, 0, 0, tzinfo=timezone.utc), timedelta(seconds=7)) with tempfile.TemporaryDirectory() as tmpdir: - callback = self._build_callback(tmpdir=tmpdir, clock=clock) + callback, handler = self._build_callback(tmpdir=tmpdir, clock=clock) pipeline = Pipeline(steps=[ _RecordingStep("sample.create-sample", should_fail=True) ]) @@ -120,8 +123,7 @@ def test_failed_step_records_error_and_persists_file(self) -> None: with self.assertRaisesRegex(ValueError, "boom"): runner.run(pipeline, callback) - state_path = os.path.join(tmpdir, ".datacommons", "demo.state.json") - with 
open(state_path, encoding="utf-8") as fp: + with open(handler.path, encoding="utf-8") as fp: state = json.load(fp) step_state = state["steps"]["sample.create-sample"] @@ -129,6 +131,54 @@ def test_failed_step_records_error_and_persists_file(self) -> None: self.assertIn("boom", step_state["message"]) self.assertAlmostEqual(step_state["duration_s"], 7.0) + def test_abort_skips_state_persistence(self) -> None: + clock = _IncrementingClock( + datetime(2025, 1, 3, 0, 0, tzinfo=timezone.utc), + timedelta(seconds=3)) + with tempfile.TemporaryDirectory() as tmpdir: + state_dir = os.path.join(tmpdir, ".datacommons") + os.makedirs(state_dir, exist_ok=True) + state_path = os.path.join(state_dir, "demo.state.json") + previous = { + "run_id": "previous", + "critical_input_hash": "old", + "command": "old command", + "updated_at": "2025-01-01T00:00:00Z", + "steps": { + "existing.step": { + "version": 1, + "status": "succeeded", + "started_at": "2025-01-01T00:00:00Z", + "ended_at": "2025-01-01T00:05:00Z", + "duration_s": 300.0, + "message": None, + } + }, + } + with open(state_path, "w", encoding="utf-8") as fp: + json.dump(previous, fp) + callback, handler = self._build_callback(tmpdir=tmpdir, clock=clock) + + class _AbortStep(BaseStep): + + def __init__(self) -> None: + super().__init__(name="download.download-data", version=1) + + def run(self) -> None: + raise PipelineAbort("user requested stop") + + def dry_run(self) -> str: + return "noop" + + pipeline = Pipeline(steps=[_AbortStep()]) + runner = PipelineRunner(RunnerConfig()) + runner.run(pipeline, callback) + + with open(handler.path, encoding="utf-8") as fp: + state = json.load(fp) + + self.assertEqual(state, previous) + if __name__ == "__main__": unittest.main() diff --git a/tools/agentic_import/state_handler.py b/tools/agentic_import/state_handler.py new file mode 100644 index 0000000000..cb2cb85b02 --- /dev/null +++ b/tools/agentic_import/state_handler.py @@ -0,0 +1,125 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""State file helpers shared by the SDMX agentic pipeline components. + +The handler centralizes JSON persistence so callers (builder, callbacks) can +operate on an in-memory `PipelineState`. This implementation assumes a single +process has exclusive ownership of the state file for the duration of a run. 
+""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from datetime import datetime, timezone +from pathlib import Path + +from absl import logging +from dataclasses_json import dataclass_json + + +@dataclass_json +@dataclass +class StepState: + version: int + status: str + started_at: str + ended_at: str + duration_s: float + message: str | None = None + + +@dataclass_json +@dataclass +class PipelineState: + run_id: str + critical_input_hash: str + command: str + updated_at: str + steps: dict[str, StepState] = field(default_factory=dict) + + +class StateHandler: + """Minimal state manager that owns JSON file I/O.""" + + def __init__(self, state_path: str | Path, dataset_prefix: str) -> None: + self._state_path = Path(state_path) + self._dataset_prefix = dataset_prefix + self._state: PipelineState | None = None + + @property + def path(self) -> Path: + """Returns the backing state file path.""" + return self._state_path + + def get_state(self) -> PipelineState: + if self._state is None: + self._state = self._load_or_init() + return self._state + + def save_state(self) -> None: + state = self.get_state() + self._write_state(state) + + def _load_or_init(self) -> PipelineState: + path = self._state_path + path.parent.mkdir(parents=True, exist_ok=True) + if not path.exists(): + state = self._empty_state() + logging.info(f"Creating new state file at {path}") + self._write_state(state) + return state + try: + with path.open("r", encoding="utf-8") as fp: + data = json.load(fp) + state = PipelineState.from_dict(data) + if not state.run_id: + state.run_id = self._dataset_prefix + return state + except (OSError, json.JSONDecodeError, ValueError, TypeError) as exc: + logging.warning(f"Failed to load state file {path}: {exc}") + self._backup_bad_file() + state = self._empty_state() + self._write_state(state) + return state + + def _write_state(self, state: PipelineState) -> None: + directory = self._state_path.parent + directory.mkdir(parents=True, exist_ok=True) + payload = json.dumps(asdict(state), indent=2, sort_keys=True) + "\n" + tmp_path = self._state_path.with_suffix(".tmp") + tmp_path.write_text(payload, encoding="utf-8") + tmp_path.replace(self._state_path) + + def _empty_state(self) -> PipelineState: + return PipelineState( + run_id=self._dataset_prefix, + critical_input_hash="", + command="", + updated_at="", + ) + + def _backup_bad_file(self) -> None: + path = self._state_path + if not path.exists(): + return + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + backup_name = f"{path.name}.bad.{timestamp}.bak" + backup_path = path.with_name(backup_name) + try: + path.replace(backup_path) + logging.warning(f"Backed up corrupt state to {backup_path}") + except OSError as exc: + logging.warning( + f"Failed to backup corrupt state file {path}: {exc}") diff --git a/tools/agentic_import/state_handler_test.py b/tools/agentic_import/state_handler_test.py new file mode 100644 index 0000000000..b8af010fd7 --- /dev/null +++ b/tools/agentic_import/state_handler_test.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for the SDMX state handler.""" + +from __future__ import annotations + +import json +import os +import sys +import tempfile +import unittest + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +if _SCRIPT_DIR not in sys.path: + sys.path.append(_SCRIPT_DIR) + +from state_handler import StateHandler # pylint: disable=import-error + + +class StateHandlerTest(unittest.TestCase): + + def test_missing_file_creates_empty_state(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "demo.state.json") + handler = StateHandler(state_path=path, dataset_prefix="demo") + + state = handler.get_state() + + self.assertTrue(os.path.exists(path)) + self.assertEqual(state.run_id, "demo") + self.assertEqual(state.steps, {}) + + with open(path, encoding="utf-8") as fp: + data = json.load(fp) + self.assertEqual(data["run_id"], "demo") + self.assertEqual(data["steps"], {}) + + def test_corrupt_file_creates_backup_and_resets_state(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "demo.state.json") + with open(path, "w", encoding="utf-8") as fp: + fp.write("{not-json}") + + handler = StateHandler(state_path=path, dataset_prefix="demo") + state = handler.get_state() + + backups = [ + name for name in os.listdir(tmpdir) + if name.startswith("demo.state.json.bad.") + ] + self.assertEqual(state.steps, {}) + self.assertGreaterEqual(len(backups), 1) + + with open(path, encoding="utf-8") as fp: + data = json.load(fp) + self.assertEqual(data["steps"], {}) + + +if __name__ == "__main__": + unittest.main() From 593db877b32f89fdf353f243e9023dda6baa9c65 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 06:38:47 +0000 Subject: [PATCH 05/54] Add interactive confirmation to SDMX runner Add CompositeCallback support for callback stacking Cover interactive and factory behavior with tests --- tools/agentic_import/pipeline.py | 15 ++++ tools/agentic_import/pipeline_test.py | 43 +++++++++- tools/agentic_import/sdmx_import_pipeline.py | 38 +++++++- .../sdmx_import_pipeline_test.py | 86 ++++++++++++++++++- 4 files changed, 177 insertions(+), 5 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index cad62d89c0..ec3d1b2457 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -87,6 +87,21 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: del step, error +class CompositeCallback(PipelineCallback): + """Fans out events to child callbacks in order.""" + + def __init__(self, callbacks: Sequence[PipelineCallback]) -> None: + self._callbacks = list(callbacks) + + def before_step(self, step: Step) -> None: + for callback in self._callbacks: + callback.before_step(step) + + def after_step(self, step: Step, *, error: Exception | None = None) -> None: + for callback in self._callbacks: + callback.after_step(step, error=error) + + @dataclass(frozen=True) class RunnerConfig: """Placeholder for future runner toggles.""" diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 
5b5ccfab07..67c25ada12 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -23,8 +23,8 @@ sys.path.append(_SCRIPT_DIR) from pipeline import ( # pylint: disable=import-error - BaseStep, Pipeline, PipelineAbort, PipelineCallback, PipelineRunner, - RunnerConfig, Step, + BaseStep, CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, + PipelineRunner, RunnerConfig, Step, ) @@ -146,5 +146,44 @@ def after_step(self, self.assertEqual(callback.after_calls, [("fail-step", "ValueError")]) +class CompositeCallbackTest(unittest.TestCase): + + def test_callbacks_run_in_order_for_each_hook(self) -> None: + events: list[str] = [] + + class RecordingCallback(PipelineCallback): + + def __init__(self, label: str) -> None: + self._label = label + + def before_step(self, step: Step) -> None: + events.append(f"{self._label}:before:{step.name}") + + def after_step(self, + step: Step, + *, + error: Exception | None = None) -> None: + del error + events.append(f"{self._label}:after:{step.name}") + + composite = CompositeCallback( + [RecordingCallback("first"), + RecordingCallback("second")]) + step = _TrackingStep("composite", events) + + composite.before_step(step) + composite.after_step(step) + + self.assertEqual( + events, + [ + "first:before:composite", + "second:before:composite", + "first:after:composite", + "second:after:composite", + ], + ) + + if __name__ == "__main__": unittest.main() diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index e49a8e19cb..43abda0612 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -20,7 +20,8 @@ from absl import logging -from tools.agentic_import.pipeline import PipelineAbort, PipelineCallback, Step +from tools.agentic_import.pipeline import (CompositeCallback, PipelineAbort, + PipelineCallback, Step) from tools.agentic_import.state_handler import StateHandler, StepState @@ -30,6 +31,20 @@ def _format_time(value: datetime) -> str: return value.isoformat() +class InteractiveCallback(PipelineCallback): + """Prompts the user before each step runs.""" + + def before_step(self, step: Step) -> None: + preview = step.dry_run() + logging.info(f"Dry run for {step.name} (v{step.version}):") + if preview: + logging.info(preview) + prompt = f"Run step {step.name} (v{step.version})? [Y/n] " + response = input(prompt).strip().lower() + if response in ("n", "no"): + raise PipelineAbort("user declined interactive prompt") + + class JSONStateCallback(PipelineCallback): """Persists pipeline progress to the SDMX state file via StateHandler. 
@@ -89,3 +104,24 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: def _now(self) -> datetime: return self._now_fn() + + +def build_pipeline_callback( + *, + state_handler: StateHandler, + run_id: str, + critical_input_hash: str, + command: str, + skip_confirmation: bool, + now_fn: Callable[[], datetime] | None = None, +) -> PipelineCallback: + """Constructs the pipeline callback stack for the SDMX runner.""" + json_callback = JSONStateCallback(state_handler=state_handler, + run_id=run_id, + critical_input_hash=critical_input_hash, + command=command, + now_fn=now_fn) + if skip_confirmation: + return json_callback + interactive = InteractiveCallback() + return CompositeCallback([interactive, json_callback]) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 259db3e979..7133d1b2b6 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -23,6 +23,9 @@ import tempfile import unittest from datetime import datetime, timedelta, timezone +from unittest import mock + +from absl import logging _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) _REPO_ROOT = os.path.dirname(_SCRIPT_DIR) @@ -32,10 +35,11 @@ sys.path.append(path) from tools.agentic_import.pipeline import ( # pylint: disable=import-error - BaseStep, Pipeline, PipelineAbort, PipelineRunner, RunnerConfig, + BaseStep, CompositeCallback, Pipeline, PipelineAbort, PipelineRunner, + RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - JSONStateCallback,) + InteractiveCallback, JSONStateCallback, build_pipeline_callback) from tools.agentic_import.state_handler import StateHandler # pylint: disable=import-error @@ -180,5 +184,83 @@ def dry_run(self) -> str: self.assertEqual(state, previous) +class InteractiveCallbackTest(unittest.TestCase): + + def test_prompt_accepts_and_runs_step(self) -> None: + callback = InteractiveCallback() + pipeline = Pipeline(steps=[_RecordingStep("download.preview")]) + runner = PipelineRunner(RunnerConfig()) + + with mock.patch("builtins.input", return_value="y"): + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + runner.run(pipeline, callback) + + self.assertTrue( + any("Dry run for download.preview" in entry + for entry in logs.output)) + + def test_prompt_decline_aborts_pipeline(self) -> None: + events: list[str] = [] + + class _TrackingStep(_RecordingStep): + + def __init__(self) -> None: + super().__init__("sample.interactive") + self.executed = False + + def run(self) -> None: + self.executed = True + super().run() + + def dry_run(self) -> str: + events.append("dry_run") + return "noop" + + callback = InteractiveCallback() + step = _TrackingStep() + pipeline = Pipeline(steps=[step]) + runner = PipelineRunner(RunnerConfig()) + + with mock.patch("builtins.input", return_value="n"): + with self.assertLogs(logging.get_absl_logger(), level="INFO"): + runner.run(pipeline, callback) + + self.assertFalse(step.executed) + self.assertTrue(events) + + +class CallbackFactoryTest(unittest.TestCase): + + def _state_handler_for_tmpdir(self, tmpdir: str) -> StateHandler: + path = os.path.join(tmpdir, ".datacommons", "demo.state.json") + return StateHandler(state_path=path, dataset_prefix="demo") + + def test_skip_confirmation_returns_json_callback(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + handler = self._state_handler_for_tmpdir(tmpdir) + callback = 
build_pipeline_callback( + state_handler=handler, + run_id="demo", + critical_input_hash="abc", + command="python run", + skip_confirmation=True, + ) + self.assertIsInstance(callback, JSONStateCallback) + + def test_interactive_mode_returns_composite(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + handler = self._state_handler_for_tmpdir(tmpdir) + with mock.patch("builtins.input", return_value="y"): + callback = build_pipeline_callback( + state_handler=handler, + run_id="demo", + critical_input_hash="abc", + command="python run", + skip_confirmation=False, + ) + self.assertIsInstance(callback, CompositeCallback) + + if __name__ == "__main__": unittest.main() From cb78e3813214145a5653c9875b7cb4fb77d91671 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 07:18:09 +0000 Subject: [PATCH 06/54] Log dry-run previews instead of returning text Adjust steps, tests, and the interactive callback for None results --- tools/agentic_import/pipeline.py | 4 ++-- tools/agentic_import/pipeline_test.py | 8 ++++---- tools/agentic_import/sdmx_import_pipeline.py | 4 +--- tools/agentic_import/sdmx_import_pipeline_test.py | 12 ++++++------ 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index ec3d1b2457..6c489e89a4 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -45,8 +45,8 @@ def run(self) -> None: """Execute the step.""" @abc.abstractmethod - def dry_run(self) -> str: - """Return a read-only preview of the work to be done.""" + def dry_run(self) -> None: + """Log a read-only preview of the work to be done.""" class BaseStep(Step, abc.ABC): diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 67c25ada12..ee19777ef3 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -39,8 +39,8 @@ def run(self) -> None: self.executed = True self._events.append(f"run:{self.name}") - def dry_run(self) -> str: - return "" + def dry_run(self) -> None: + return None class _FailingStep(BaseStep): @@ -51,8 +51,8 @@ def __init__(self, *, name: str, version: int) -> None: def run(self) -> None: raise ValueError("boom") - def dry_run(self) -> str: - return "" + def dry_run(self) -> None: + return None class PipelineRunnerTest(unittest.TestCase): diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 43abda0612..386e56bb6a 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -35,10 +35,8 @@ class InteractiveCallback(PipelineCallback): """Prompts the user before each step runs.""" def before_step(self, step: Step) -> None: - preview = step.dry_run() logging.info(f"Dry run for {step.name} (v{step.version}):") - if preview: - logging.info(preview) + step.dry_run() prompt = f"Run step {step.name} (v{step.version})? 
[Y/n] " response = input(prompt).strip().lower() if response in ("n", "no"): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 7133d1b2b6..a48e1b153b 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -68,8 +68,8 @@ def run(self) -> None: if self._should_fail: raise ValueError("boom") - def dry_run(self) -> str: - return "noop" + def dry_run(self) -> None: + logging.info("noop") class JSONStateCallbackTest(unittest.TestCase): @@ -171,8 +171,8 @@ def __init__(self) -> None: def run(self) -> None: raise PipelineAbort("user requested stop") - def dry_run(self) -> str: - return "noop" + def dry_run(self) -> None: + logging.info("noop") pipeline = Pipeline(steps=[_AbortStep()]) runner = PipelineRunner(RunnerConfig()) @@ -213,9 +213,9 @@ def run(self) -> None: self.executed = True super().run() - def dry_run(self) -> str: + def dry_run(self) -> None: events.append("dry_run") - return "noop" + logging.info("noop") callback = InteractiveCallback() step = _TrackingStep() From 746e1305a5b4f83508380e0446eae8f0842defc3 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 11:59:43 +0000 Subject: [PATCH 07/54] Add SDMX pipeline planning --- tools/agentic_import/sdmx_import_pipeline.py | 147 +++++++++++++++++- .../sdmx_import_pipeline_test.py | 140 ++++++++++++++++- 2 files changed, 281 insertions(+), 6 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 386e56bb6a..a0d74f797d 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,14 +15,17 @@ from __future__ import annotations +from dataclasses import dataclass from datetime import datetime, timezone -from typing import Callable +from typing import Callable, Sequence from absl import logging -from tools.agentic_import.pipeline import (CompositeCallback, PipelineAbort, - PipelineCallback, Step) -from tools.agentic_import.state_handler import StateHandler, StepState +from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, + PipelineAbort, PipelineCallback, + Step) +from tools.agentic_import.state_handler import (PipelineState, StateHandler, + StepState) def _format_time(value: datetime) -> str: @@ -123,3 +126,139 @@ def build_pipeline_callback( return json_callback interactive = InteractiveCallback() return CompositeCallback([interactive, json_callback]) + + +@dataclass(frozen=True) +class SdmxPipelineConfig: + """User-configurable inputs that mimic planned CLI flags. + + This is a lightweight container; CLI parsing will be added in a later + phase. Defaults are intentionally minimal. 
+ """ + + endpoint: str | None = None + agency: str | None = None + dataflow: str | None = None + key: str | None = None + dataset_prefix: str | None = None + working_dir: str | None = None + run_only: str | None = None + force: bool = False + verbose: bool = False + skip_confirmation: bool = False + + +class SdmxStep(Step): + """Base class for SDMX steps that carries immutable config and version.""" + + def __init__(self, *, name: str, version: int, + config: SdmxPipelineConfig) -> None: + if not name: + raise ValueError("step requires a name") + self._name = name + self._version = version + self._config = config + + @property + def name(self) -> str: + return self._name + + @property + def version(self) -> int: + return self._version + + # Subclasses must implement run() and dry_run(). + + +@dataclass(frozen=True) +class StepSpec: + phase: str + name: str + version: int + factory: Callable[[SdmxPipelineConfig], Step] + + @property + def full_name(self) -> str: + return f"{self.phase}.{self.name}" + + +@dataclass(frozen=True) +class PhaseSpec: + name: str + steps: Sequence[StepSpec] + + +@dataclass(frozen=True) +class SdmxPhaseRegistry: + phases: Sequence[PhaseSpec] + + def flatten(self) -> list[StepSpec]: + flattened: list[StepSpec] = [] + for phase in self.phases: + flattened.extend(phase.steps) + return flattened + + +class SdmxPipelineBuilder: + + def __init__(self, *, config: SdmxPipelineConfig, state: PipelineState, + registry: SdmxPhaseRegistry) -> None: + self._config = config + self._state = state + self._registry = registry + self._specs = registry.flatten() + + def build(self) -> Pipeline: + planned = self._plan_steps() + steps = [spec.factory(self._config) for spec in planned] + logging.info("Built SDMX pipeline with %d steps", len(steps)) + return Pipeline(steps=steps) + + def _plan_steps(self) -> list[StepSpec]: + specs = self._select_specs(self._specs, self._config.run_only) + if not specs: + return [] + force_all = bool(self._config.force and not self._config.run_only) + if force_all: + return list(specs) + scheduled: list[StepSpec] = [] + downstream = False + for spec in specs: + needs_run = self._should_run(spec) + if needs_run and not downstream: + downstream = True + if downstream: + scheduled.append(spec) + if not scheduled: + logging.info("No steps scheduled; all steps current") + return scheduled + + def _select_specs(self, specs: Sequence[StepSpec], + run_only: str | None) -> list[StepSpec]: + if not run_only: + return list(specs) + if "." 
in run_only: + scoped = [s for s in specs if s.full_name == run_only] + if not scoped: + raise ValueError(f"run_only target not found: {run_only}") + return scoped + scoped = [s for s in specs if s.phase == run_only] + if not scoped: + raise ValueError(f"run_only phase not found: {run_only}") + return scoped + + def _should_run(self, spec: StepSpec) -> bool: + prev = self._state.steps.get(spec.full_name) + if prev is None: + return True + if prev.status != "succeeded": + return True + if prev.version < spec.version: + return True + return False + + +def build_sdmx_pipeline(*, config: SdmxPipelineConfig, state: PipelineState, + registry: SdmxPhaseRegistry) -> Pipeline: + builder = SdmxPipelineBuilder(config=config, state=state, registry=registry) + return builder.build() diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index a48e1b153b..3c8f4690f6 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -39,8 +39,11 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - InteractiveCallback, JSONStateCallback, build_pipeline_callback) -from tools.agentic_import.state_handler import StateHandler # pylint: disable=import-error + InteractiveCallback, JSONStateCallback, SdmxPipelineBuilder, + SdmxPipelineConfig, SdmxPhaseRegistry, PhaseSpec, StepSpec, SdmxStep, + build_pipeline_callback, build_sdmx_pipeline) +from tools.agentic_import.state_handler import ( # pylint: disable=import-error + PipelineState, StateHandler, StepState) class _IncrementingClock: @@ -262,5 +265,138 @@ def test_interactive_mode_returns_composite(self) -> None: self.assertIsInstance(callback, CompositeCallback) +class _TestStep(SdmxStep): + + def run(self) -> None: + pass + + def dry_run(self) -> None: + logging.info("noop") + + +class PlanningTest(unittest.TestCase): + + def _mk_spec(self, phase: str, name: str, version: int) -> StepSpec: + full = f"{phase}.{name}" + + def _factory(cfg: SdmxPipelineConfig) -> _TestStep: + return _TestStep(name=full, version=version, config=cfg) + + return StepSpec(phase=phase, + name=name, + version=version, + factory=_factory) + + def _mk_registry(self) -> SdmxPhaseRegistry: + download = PhaseSpec( + name="download", + steps=[ + self._mk_spec("download", "fetch", 1), + self._mk_spec("download", "preview", 1) + ], + ) + process = PhaseSpec( + name="process", + steps=[self._mk_spec("process", "clean", 1)], + ) + export = PhaseSpec( + name="export", + steps=[self._mk_spec("export", "write", 1)], + ) + return SdmxPhaseRegistry(phases=[download, process, export]) + + def _empty_state(self) -> PipelineState: + return PipelineState(run_id="demo", + critical_input_hash="", + command="", + updated_at="", + steps={}) + + def _state_with(self, versions: dict[str, tuple[int, + str]]) -> PipelineState: + steps = { + name: + StepState(version=v, + status=st, + started_at="t", + ended_at="t", + duration_s=0.0) for name, (v, st) in versions.items() + } + return PipelineState(run_id="demo", + critical_input_hash="", + command="", + updated_at="", + steps=steps) + + def _names_from_builder(self, + cfg: SdmxPipelineConfig, + reg: SdmxPhaseRegistry, + state: PipelineState | None = None) -> list[str]: + builder = SdmxPipelineBuilder(config=cfg, + state=state or self._empty_state(), + registry=reg) + pipeline = builder.build() + return [step.name for step in pipeline.get_steps()] + + def test_run_only_phase_and_step(self) -> 
None: + reg = self._mk_registry() + cfg_phase = SdmxPipelineConfig(run_only="download") + names_phase = self._names_from_builder(cfg_phase, reg) + self.assertEqual(names_phase, ["download.fetch", "download.preview"]) + + cfg_step = SdmxPipelineConfig(run_only="download.fetch") + names_step = self._names_from_builder(cfg_step, reg) + self.assertEqual(names_step, ["download.fetch"]) + + with self.assertRaisesRegex(ValueError, "run_only phase not found"): + self._names_from_builder(SdmxPipelineConfig(run_only="nope"), reg) + with self.assertRaisesRegex(ValueError, "run_only target not found"): + self._names_from_builder( + SdmxPipelineConfig(run_only="download.nope"), reg) + + def test_force_semantics(self) -> None: + reg = self._mk_registry() + cfg_all = SdmxPipelineConfig(force=True) + names_all = self._names_from_builder(cfg_all, reg) + self.assertEqual(names_all, [ + "download.fetch", + "download.preview", + "process.clean", + "export.write", + ]) + + cfg_phase = SdmxPipelineConfig(run_only="download", force=True) + names_phase = self._names_from_builder(cfg_phase, reg) + self.assertEqual(names_phase, ["download.fetch", "download.preview"]) + + def test_version_bump_schedules_downstream(self) -> None: + # Make process.clean a new version while others remain the same. + download = PhaseSpec( + name="download", + steps=[self._mk_spec("download", "fetch", 1)], + ) + process = PhaseSpec( + name="process", + steps=[self._mk_spec("process", "clean", 2)], + ) + export = PhaseSpec( + name="export", + steps=[self._mk_spec("export", "write", 1)], + ) + reg = SdmxPhaseRegistry(phases=[download, process, export]) + state = self._state_with({ + "download.fetch": (1, "succeeded"), + "process.clean": (1, "succeeded"), + "export.write": (1, "succeeded"), + }) + cfg = SdmxPipelineConfig() + names = self._names_from_builder(cfg, reg, state) + self.assertEqual(names, ["process.clean", "export.write"]) + + pipeline = build_sdmx_pipeline(config=cfg, state=state, registry=reg) + self.assertEqual([s.name for s in pipeline.get_steps()], + ["process.clean", "export.write"]) + + if __name__ == "__main__": unittest.main() From 561d61cc4926ec43df7ab70c57d3be33c6a91024 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 14:41:41 +0000 Subject: [PATCH 08/54] Generalize SDMX pipeline naming Add placeholder steps for download/process/config phases Refresh tests for the renamed builder API --- tools/agentic_import/sdmx_import_pipeline.py | 118 ++++++++++++++++-- .../sdmx_import_pipeline_test.py | 40 +++--- 2 files changed, 128 insertions(+), 30 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index a0d74f797d..83b71d3ec1 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -129,7 +129,7 @@ def build_pipeline_callback( @dataclass(frozen=True) -class SdmxPipelineConfig: +class PipelineConfig: """User-configurable inputs that mimic planned CLI flags. This is a lightweight container; CLI parsing will be added in a later @@ -152,7 +152,7 @@ class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" def __init__(self, *, name: str, version: int, - config: SdmxPipelineConfig) -> None: + config: PipelineConfig) -> None: if not name: raise ValueError("step requires a name") self._name = name @@ -170,12 +170,110 @@ def version(self) -> int: # Subclasses must implement run() and dry_run(). 
+class DownloadDataStep(SdmxStep): + """Downloads SDMX data payloads.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info(f"{self.name} (dry run): previewing data download inputs") + + +class DownloadMetadataStep(SdmxStep): + """Downloads SDMX metadata payloads.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info( + f"{self.name} (dry run): previewing metadata download inputs") + + +class CreateSampleStep(SdmxStep): + """Creates a sample dataset from downloaded data.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info(f"{self.name} (dry run): previewing sample generation") + + +class CreateSchemaMapStep(SdmxStep): + """Builds schema mappings for transformed data.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info( + f"{self.name} (dry run): previewing schema mapping outputs") + + +class ProcessFullDataStep(SdmxStep): + """Processes full SDMX data into DC artifacts.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info(f"{self.name} (dry run): previewing full-data processing") + + +class CreateDcConfigStep(SdmxStep): + """Generates Datacommons configuration artifacts.""" + + VERSION = 1 + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + + def run(self) -> None: + logging.info( + f"{self.name}: no-op implementation for VERSION={self.VERSION}") + + def dry_run(self) -> None: + logging.info(f"{self.name} (dry run): previewing DC config creation") + + @dataclass(frozen=True) class StepSpec: phase: str name: str version: int - factory: Callable[[SdmxPipelineConfig], Step] + factory: Callable[[PipelineConfig], Step] @property def full_name(self) -> str: @@ -189,7 +287,7 @@ class PhaseSpec: @dataclass(frozen=True) -class SdmxPhaseRegistry: +class PhaseRegistry: phases: Sequence[PhaseSpec] def flatten(self) -> list[StepSpec]: @@ -199,10 +297,10 @@ def flatten(self) -> list[StepSpec]: return flattened -class SdmxPipelineBuilder: +class PipelineBuilder: - def __init__(self, *, config: SdmxPipelineConfig, state: PipelineState, - registry: SdmxPhaseRegistry) -> None: + def __init__(self, *, config: PipelineConfig, state: PipelineState, + registry: PhaseRegistry) -> None: self._config = config self._state = state self._registry = registry @@ 
-258,7 +356,7 @@ def _should_run(self, spec: StepSpec) -> bool: return False -def build_sdmx_pipeline(*, config: SdmxPipelineConfig, state: PipelineState, - registry: SdmxPhaseRegistry) -> Pipeline: - builder = SdmxPipelineBuilder(config=config, state=state, registry=registry) +def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, + registry: PhaseRegistry) -> Pipeline: + builder = PipelineBuilder(config=config, state=state, registry=registry) return builder.build() diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 3c8f4690f6..8e023e6f0d 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -39,9 +39,9 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - InteractiveCallback, JSONStateCallback, SdmxPipelineBuilder, - SdmxPipelineConfig, SdmxPhaseRegistry, PhaseSpec, StepSpec, SdmxStep, - build_pipeline_callback, build_sdmx_pipeline) + InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, + PhaseRegistry, PhaseSpec, StepSpec, SdmxStep, build_pipeline_callback, + build_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -279,7 +279,7 @@ class PlanningTest(unittest.TestCase): def _mk_spec(self, phase: str, name: str, version: int) -> StepSpec: full = f"{phase}.{name}" - def _factory(cfg: SdmxPipelineConfig) -> _TestStep: + def _factory(cfg: PipelineConfig) -> _TestStep: return _TestStep(name=full, version=version, config=cfg) return StepSpec(phase=phase, @@ -287,7 +287,7 @@ def _factory(cfg: SdmxPipelineConfig) -> _TestStep: version=version, factory=_factory) - def _mk_registry(self) -> SdmxPhaseRegistry: + def _mk_registry(self) -> PhaseRegistry: download = PhaseSpec( name="download", steps=[ @@ -303,7 +303,7 @@ def _mk_registry(self) -> SdmxPhaseRegistry: name="export", steps=[self._mk_spec("export", "write", 1)], ) - return SdmxPhaseRegistry(phases=[download, process, export]) + return PhaseRegistry(phases=[download, process, export]) def _empty_state(self) -> PipelineState: return PipelineState(run_id="demo", @@ -329,34 +329,34 @@ def _state_with(self, versions: dict[str, tuple[int, steps=steps) def _names_from_builder(self, - cfg: SdmxPipelineConfig, - reg: SdmxPhaseRegistry, + cfg: PipelineConfig, + reg: PhaseRegistry, state: PipelineState | None = None) -> list[str]: - builder = SdmxPipelineBuilder(config=cfg, - state=state or self._empty_state(), - registry=reg) + builder = PipelineBuilder(config=cfg, + state=state or self._empty_state(), + registry=reg) pipeline = builder.build() return [step.name for step in pipeline.get_steps()] def test_run_only_phase_and_step(self) -> None: reg = self._mk_registry() - cfg_phase = SdmxPipelineConfig(run_only="download") + cfg_phase = PipelineConfig(run_only="download") names_phase = self._names_from_builder(cfg_phase, reg) self.assertEqual(names_phase, ["download.fetch", "download.preview"]) - cfg_step = SdmxPipelineConfig(run_only="download.fetch") + cfg_step = PipelineConfig(run_only="download.fetch") names_step = self._names_from_builder(cfg_step, reg) self.assertEqual(names_step, ["download.fetch"]) with self.assertRaisesRegex(ValueError, "run_only phase not found"): - self._names_from_builder(SdmxPipelineConfig(run_only="nope"), reg) + self._names_from_builder(PipelineConfig(run_only="nope"), reg) with 
self.assertRaisesRegex(ValueError, "run_only target not found"): - self._names_from_builder( - SdmxPipelineConfig(run_only="download.nope"), reg) + self._names_from_builder(PipelineConfig(run_only="download.nope"), + reg) def test_force_semantics(self) -> None: reg = self._mk_registry() - cfg_all = SdmxPipelineConfig(force=True) + cfg_all = PipelineConfig(force=True) names_all = self._names_from_builder(cfg_all, reg) self.assertEqual(names_all, [ "download.fetch", @@ -365,7 +365,7 @@ def test_force_semantics(self) -> None: "export.write", ]) - cfg_phase = SdmxPipelineConfig(run_only="download", force=True) + cfg_phase = PipelineConfig(run_only="download", force=True) names_phase = self._names_from_builder(cfg_phase, reg) self.assertEqual(names_phase, ["download.fetch", "download.preview"]) @@ -383,13 +383,13 @@ def test_version_bump_schedules_downstream(self) -> None: name="export", steps=[self._mk_spec("export", "write", 1)], ) - reg = SdmxPhaseRegistry(phases=[download, process, export]) + reg = PhaseRegistry(phases=[download, process, export]) state = self._state_with({ "download.fetch": (1, "succeeded"), "process.clean": (1, "succeeded"), "export.write": (1, "succeeded"), }) - cfg = SdmxPipelineConfig() + cfg = PipelineConfig() names = self._names_from_builder(cfg, reg, state) self.assertEqual(names, ["process.clean", "export.write"]) From 0cfcc217078e45afd78bf0294019689185e7a852 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 11 Nov 2025 16:35:31 +0000 Subject: [PATCH 09/54] Use explicit SDMX registry builder Update pipeline builder and tests to rely on the canonical registry and step names --- tools/agentic_import/sdmx_import_pipeline.py | 48 +++++++- .../sdmx_import_pipeline_test.py | 115 ++++++++---------- 2 files changed, 96 insertions(+), 67 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 83b71d3ec1..7264395630 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -304,7 +304,7 @@ def __init__(self, *, config: PipelineConfig, state: PipelineState, self._config = config self._state = state self._registry = registry - self._specs = registry.flatten() + self._specs = self._registry.flatten() def build(self) -> Pipeline: planned = self._plan_steps() @@ -356,7 +356,49 @@ def _should_run(self, spec: StepSpec) -> bool: return False -def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, - registry: PhaseRegistry) -> Pipeline: + + + +def build_registry() -> PhaseRegistry: + """Constructs the hard-coded Phase 2 registry with canonical steps.""" + def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec: + full = f"{phase}.{step}" + return StepSpec( + phase=phase, + name=step, + version=cls.VERSION, + factory=lambda cfg, full_name=full, ctor=cls: ctor(name=full_name, config=cfg), + ) + + download = PhaseSpec( + name="download", + steps=[ + _spec("download", "download-data", DownloadDataStep), + _spec("download", "download-metadata", DownloadMetadataStep), + ], + ) + sample = PhaseSpec( + name="sample", + steps=[ + _spec("sample", "create-sample", CreateSampleStep), + ], + ) + schema_map = PhaseSpec( + name="schema_map", + steps=[ + _spec("schema_map", "create-schema-mapping", CreateSchemaMapStep), + ], + ) + transform = PhaseSpec( + name="transform", + steps=[ + _spec("transform", "process-full-data", ProcessFullDataStep), + _spec("transform", "create-dc-config", CreateDcConfigStep), + ], + ) + return 
PhaseRegistry(phases=[download, sample, schema_map, transform]) + +def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, registry: PhaseRegistry) -> Pipeline: builder = PipelineBuilder(config=config, state=state, registry=registry) return builder.build() + diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 8e023e6f0d..3565c4ed61 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -39,9 +39,10 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error + CreateDcConfigStep, DownloadDataStep, ProcessFullDataStep, InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, PhaseRegistry, PhaseSpec, StepSpec, SdmxStep, build_pipeline_callback, - build_sdmx_pipeline) + build_registry, build_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -265,45 +266,10 @@ def test_interactive_mode_returns_composite(self) -> None: self.assertIsInstance(callback, CompositeCallback) -class _TestStep(SdmxStep): - - def run(self) -> None: - pass - - def dry_run(self) -> None: - logging.info("noop") - - class PlanningTest(unittest.TestCase): - def _mk_spec(self, phase: str, name: str, version: int) -> StepSpec: - full = f"{phase}.{name}" - - def _factory(cfg: PipelineConfig) -> _TestStep: - return _TestStep(name=full, version=version, config=cfg) - - return StepSpec(phase=phase, - name=name, - version=version, - factory=_factory) - def _mk_registry(self) -> PhaseRegistry: - download = PhaseSpec( - name="download", - steps=[ - self._mk_spec("download", "fetch", 1), - self._mk_spec("download", "preview", 1) - ], - ) - process = PhaseSpec( - name="process", - steps=[self._mk_spec("process", "clean", 1)], - ) - export = PhaseSpec( - name="export", - steps=[self._mk_spec("export", "write", 1)], - ) - return PhaseRegistry(phases=[download, process, export]) + return build_registry() def _empty_state(self) -> PipelineState: return PipelineState(run_id="demo", @@ -342,11 +308,11 @@ def test_run_only_phase_and_step(self) -> None: reg = self._mk_registry() cfg_phase = PipelineConfig(run_only="download") names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual(names_phase, ["download.fetch", "download.preview"]) + self.assertEqual(names_phase, ["download.download-data", "download.download-metadata"]) - cfg_step = PipelineConfig(run_only="download.fetch") + cfg_step = PipelineConfig(run_only="download.download-data") names_step = self._names_from_builder(cfg_step, reg) - self.assertEqual(names_step, ["download.fetch"]) + self.assertEqual(names_step, ["download.download-data"]) with self.assertRaisesRegex(ValueError, "run_only phase not found"): self._names_from_builder(PipelineConfig(run_only="nope"), reg) @@ -359,43 +325,64 @@ def test_force_semantics(self) -> None: cfg_all = PipelineConfig(force=True) names_all = self._names_from_builder(cfg_all, reg) self.assertEqual(names_all, [ - "download.fetch", - "download.preview", - "process.clean", - "export.write", + "download.download-data", + "download.download-metadata", + "sample.create-sample", + "schema_map.create-schema-mapping", + "transform.process-full-data", + "transform.create-dc-config", ]) cfg_phase = PipelineConfig(run_only="download", force=True) names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual(names_phase, 
["download.fetch", "download.preview"]) + self.assertEqual(names_phase, + ["download.download-data", "download.download-metadata"]) def test_version_bump_schedules_downstream(self) -> None: - # Make process.clean a new version while others remain the same. - download = PhaseSpec( - name="download", - steps=[self._mk_spec("download", "fetch", 1)], - ) - process = PhaseSpec( - name="process", - steps=[self._mk_spec("process", "clean", 2)], - ) - export = PhaseSpec( - name="export", - steps=[self._mk_spec("export", "write", 1)], - ) - reg = PhaseRegistry(phases=[download, process, export]) + reg = PhaseRegistry(phases=[ + PhaseSpec( + name="download", + steps=[ + StepSpec( + phase="download", + name="download-data", + version=1, + factory=lambda cfg: DownloadDataStep( + name="download.download-data", config=cfg)), + ] + ), + PhaseSpec( + name="transform", + steps=[ + StepSpec( + phase="transform", + name="process-full-data", + version=2, + factory=lambda cfg: ProcessFullDataStep( + name="transform.process-full-data", config=cfg)), + StepSpec( + phase="transform", + name="create-dc-config", + version=1, + factory=lambda cfg: CreateDcConfigStep( + name="transform.create-dc-config", config=cfg)), + ] + ) + ]) state = self._state_with({ - "download.fetch": (1, "succeeded"), - "process.clean": (1, "succeeded"), - "export.write": (1, "succeeded"), + "download.download-data": (1, "succeeded"), + "transform.process-full-data": (1, "succeeded"), + "transform.create-dc-config": (1, "succeeded"), }) cfg = PipelineConfig() names = self._names_from_builder(cfg, reg, state) - self.assertEqual(names, ["process.clean", "export.write"]) + self.assertEqual(names, [ + "transform.process-full-data", "transform.create-dc-config" + ]) pipeline = build_sdmx_pipeline(config=cfg, state=state, registry=reg) self.assertEqual([s.name for s in pipeline.get_steps()], - ["process.clean", "export.write"]) + ["transform.process-full-data", "transform.create-dc-config"]) if __name__ == "__main__": From 8d3525ad11b9039434fd315c4ac9e50d1c44d745 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 12 Nov 2025 05:04:28 +0000 Subject: [PATCH 10/54] feat(sdmx-import): Add timestamps and improve pipeline planning This commit introduces two main improvements to the SDMX import pipeline: 1. **Timestamps in State:** Adds and (millisecond-precision Unix timestamps) to both the individual step state and the overall pipeline state. This allows for more precise and reliable comparisons of step execution times. 2. **Smarter Pipeline Planning:** The pipeline planning logic is now more robust. It will now re-run a step if any of its preceding steps have been executed more recently, even if the step itself is marked as succeeded. This ensures that downstream steps are always up-to-date with their dependencies. This commit also includes: - Refactoring of the step selection logic for clarity. - Addition of a helper function. - Updated tests to reflect these changes. 
--- tools/agentic_import/sdmx_import_pipeline.py | 83 +++++++---- .../sdmx_import_pipeline_test.py | 139 ++++++++++++------ tools/agentic_import/state_handler.py | 4 + tools/agentic_import/state_handler_test.py | 2 + 4 files changed, 156 insertions(+), 72 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 7264395630..3e757e0fc3 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -80,6 +80,8 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: if started_at is None: started_at = ended_at duration = max(0.0, (ended_at - started_at).total_seconds()) + started_at_ts = int(started_at.timestamp() * 1000) + ended_at_ts = int(ended_at.timestamp() * 1000) if isinstance(error, PipelineAbort): logging.info( f"Skipping state update for {step.name} due to pipeline abort") @@ -97,10 +99,13 @@ def after_step(self, step: Step, *, error: Exception | None = None) -> None: started_at=_format_time(started_at), ended_at=_format_time(ended_at), duration_s=duration, + started_at_ts=started_at_ts, + ended_at_ts=ended_at_ts, message=message, ) self._state.steps[step.name] = step_state self._state.updated_at = step_state.ended_at + self._state.updated_at_ts = ended_at_ts self._handler.save_state() def _now(self) -> datetime: @@ -277,7 +282,7 @@ class StepSpec: @property def full_name(self) -> str: - return f"{self.phase}.{self.name}" + return _format_full_name(self.phase, self.name) @dataclass(frozen=True) @@ -313,36 +318,39 @@ def build(self) -> Pipeline: return Pipeline(steps=steps) def _plan_steps(self) -> list[StepSpec]: - specs = self._select_specs(self._specs, self._config.run_only) - if not specs: - return [] - force_all = bool(self._config.force and not self._config.run_only) - if force_all: - return list(specs) + if self._config.run_only: + return self._filter_run_only(self._specs, self._config.run_only) + if self._config.force: + return list(self._specs) scheduled: list[StepSpec] = [] - downstream = False - for spec in specs: - needs_run = self._should_run(spec) - if needs_run and not downstream: - downstream = True - if downstream: + schedule_all_remaining = False + previous: StepSpec | None = None + for spec in self._specs: + if schedule_all_remaining: scheduled.append(spec) + else: + needs_run = self._should_run(spec) + if not needs_run and previous is not None: + needs_run = self._predecessor_newer(previous, spec) + if needs_run: + scheduled.append(spec) + schedule_all_remaining = True + previous = spec if not scheduled: - logging.info("No steps scheduled; all steps current") + logging.info("No steps scheduled.") return scheduled - def _select_specs(self, specs: Sequence[StepSpec], - run_only: str | None) -> list[StepSpec]: - if not run_only: - return list(specs) - if "." in run_only: + def _filter_run_only(self, specs: Sequence[StepSpec], + run_only: str) -> list[StepSpec]: + is_step = "." 
in run_only + if is_step: scoped = [s for s in specs if s.full_name == run_only] - if not scoped: - raise ValueError(f"run_only target not found: {run_only}") - return scoped - scoped = [s for s in specs if s.phase == run_only] + else: + scoped = [s for s in specs if s.phase == run_only] + if not scoped: - raise ValueError(f"run_only phase not found: {run_only}") + entity = "step" if is_step else "phase" + raise ValueError(f"run_only {entity} not found: {run_only}") return scoped def _should_run(self, spec: StepSpec) -> bool: @@ -355,19 +363,31 @@ def _should_run(self, spec: StepSpec) -> bool: return True return False - - + def _predecessor_newer(self, prev_spec: StepSpec, spec: StepSpec) -> bool: + prev_state = self._state.steps.get(prev_spec.full_name) + curr_state = self._state.steps.get(spec.full_name) + if prev_state is None or prev_state.ended_at_ts is None: + return False + if curr_state is None: + return True + if curr_state.status != "succeeded": + return True + if curr_state.ended_at_ts is None: + return True + return prev_state.ended_at_ts > curr_state.ended_at_ts def build_registry() -> PhaseRegistry: """Constructs the hard-coded Phase 2 registry with canonical steps.""" + def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec: - full = f"{phase}.{step}" + full = _format_full_name(phase, step) return StepSpec( phase=phase, name=step, version=cls.VERSION, - factory=lambda cfg, full_name=full, ctor=cls: ctor(name=full_name, config=cfg), + factory=lambda cfg, full_name=full, ctor=cls: ctor(name=full_name, + config=cfg), ) download = PhaseSpec( @@ -398,7 +418,12 @@ def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec: ) return PhaseRegistry(phases=[download, sample, schema_map, transform]) -def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, registry: PhaseRegistry) -> Pipeline: + +def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, + registry: PhaseRegistry) -> Pipeline: builder = PipelineBuilder(config=config, state=state, registry=registry) return builder.build() + +def _format_full_name(phase: str, step: str) -> str: + return f"{phase}.{step}" diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 3565c4ed61..0f26c61a7e 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -116,6 +116,13 @@ def test_successful_step_persists_expected_schema(self) -> None: self.assertIn("message", step_state) self.assertIsNone(step_state["message"]) self.assertEqual(state["updated_at"], step_state["ended_at"]) + ended_at_dt = datetime.fromisoformat(step_state["ended_at"]) + started_at_dt = datetime.fromisoformat(step_state["started_at"]) + self.assertEqual(step_state["ended_at_ts"], + int(ended_at_dt.timestamp() * 1000)) + self.assertEqual(step_state["started_at_ts"], + int(started_at_dt.timestamp() * 1000)) + self.assertEqual(state["updated_at_ts"], step_state["ended_at_ts"]) def test_failed_step_records_error_and_persists_file(self) -> None: clock = _IncrementingClock( @@ -138,6 +145,8 @@ def test_failed_step_records_error_and_persists_file(self) -> None: self.assertEqual(step_state["status"], "failed") self.assertIn("boom", step_state["message"]) self.assertAlmostEqual(step_state["duration_s"], 7.0) + self.assertIn("ended_at_ts", step_state) + self.assertIn("started_at_ts", step_state) def test_abort_skips_state_persistence(self) -> None: clock = _IncrementingClock( @@ -152,12 +161,15 @@ def 
test_abort_skips_state_persistence(self) -> None: "critical_input_hash": "old", "command": "old command", "updated_at": "2025-01-01T00:00:00Z", + "updated_at_ts": 1, "steps": { "existing.step": { "version": 1, "status": "succeeded", "started_at": "2025-01-01T00:00:00Z", + "started_at_ts": 0, "ended_at": "2025-01-01T00:05:00Z", + "ended_at_ts": 300000, "duration_s": 300.0, "message": None, } @@ -278,20 +290,26 @@ def _empty_state(self) -> PipelineState: updated_at="", steps={}) - def _state_with(self, versions: dict[str, tuple[int, - str]]) -> PipelineState: + def _state_with( + self, versions: dict[str, tuple[int, str, + int | None]]) -> PipelineState: steps = { name: StepState(version=v, status=st, started_at="t", ended_at="t", - duration_s=0.0) for name, (v, st) in versions.items() + duration_s=0.0, + started_at_ts=ts, + ended_at_ts=ts, + message=None) + for name, (v, st, ts) in versions.items() } return PipelineState(run_id="demo", critical_input_hash="", command="", updated_at="", + updated_at_ts=None, steps=steps) def _names_from_builder(self, @@ -308,7 +326,9 @@ def test_run_only_phase_and_step(self) -> None: reg = self._mk_registry() cfg_phase = PipelineConfig(run_only="download") names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual(names_phase, ["download.download-data", "download.download-metadata"]) + self.assertEqual( + names_phase, + ["download.download-data", "download.download-metadata"]) cfg_step = PipelineConfig(run_only="download.download-data") names_step = self._names_from_builder(cfg_step, reg) @@ -316,7 +336,7 @@ def test_run_only_phase_and_step(self) -> None: with self.assertRaisesRegex(ValueError, "run_only phase not found"): self._names_from_builder(PipelineConfig(run_only="nope"), reg) - with self.assertRaisesRegex(ValueError, "run_only target not found"): + with self.assertRaisesRegex(ValueError, "run_only step not found"): self._names_from_builder(PipelineConfig(run_only="download.nope"), reg) @@ -335,54 +355,87 @@ def test_force_semantics(self) -> None: cfg_phase = PipelineConfig(run_only="download", force=True) names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual(names_phase, - ["download.download-data", "download.download-metadata"]) + self.assertEqual( + names_phase, + ["download.download-data", "download.download-metadata"]) + + def test_timestamp_chaining_triggers_next_step(self) -> None: + reg = self._mk_registry() + newer = 2_000 + older = 1_000 + state = self._state_with({ + "download.download-data": (1, "succeeded", newer), + "download.download-metadata": (1, "succeeded", older), + "sample.create-sample": (1, "succeeded", older), + "schema_map.create-schema-mapping": (1, "succeeded", older), + "transform.process-full-data": (1, "succeeded", older), + "transform.create-dc-config": (1, "succeeded", older), + }) + cfg = PipelineConfig() + names = self._names_from_builder(cfg, reg, state) + self.assertEqual(names, [ + "download.download-metadata", + "sample.create-sample", + "schema_map.create-schema-mapping", + "transform.process-full-data", + "transform.create-dc-config", + ]) + + def test_run_only_ignores_timestamp_chaining(self) -> None: + reg = self._mk_registry() + newer = 4_000 + older = 3_000 + state = self._state_with({ + "download.download-data": (1, "succeeded", newer), + "download.download-metadata": (1, "succeeded", older), + }) + cfg = PipelineConfig(run_only="download") + names = self._names_from_builder(cfg, reg, state) + self.assertEqual( + names, ["download.download-data", 
"download.download-metadata"]) def test_version_bump_schedules_downstream(self) -> None: reg = PhaseRegistry(phases=[ - PhaseSpec( - name="download", - steps=[ - StepSpec( - phase="download", - name="download-data", - version=1, - factory=lambda cfg: DownloadDataStep( - name="download.download-data", config=cfg)), - ] - ), - PhaseSpec( - name="transform", - steps=[ - StepSpec( - phase="transform", - name="process-full-data", - version=2, - factory=lambda cfg: ProcessFullDataStep( - name="transform.process-full-data", config=cfg)), - StepSpec( - phase="transform", - name="create-dc-config", - version=1, - factory=lambda cfg: CreateDcConfigStep( - name="transform.create-dc-config", config=cfg)), - ] - ) + PhaseSpec(name="download", + steps=[ + StepSpec( + phase="download", + name="download-data", + version=1, + factory=lambda cfg: DownloadDataStep( + name="download.download-data", config=cfg)), + ]), + PhaseSpec(name="transform", + steps=[ + StepSpec(phase="transform", + name="process-full-data", + version=2, + factory=lambda cfg: ProcessFullDataStep( + name="transform.process-full-data", + config=cfg)), + StepSpec(phase="transform", + name="create-dc-config", + version=1, + factory=lambda cfg: CreateDcConfigStep( + name="transform.create-dc-config", + config=cfg)), + ]) ]) state = self._state_with({ - "download.download-data": (1, "succeeded"), - "transform.process-full-data": (1, "succeeded"), - "transform.create-dc-config": (1, "succeeded"), + "download.download-data": (1, "succeeded", 1000), + "transform.process-full-data": (1, "succeeded", 1000), + "transform.create-dc-config": (1, "succeeded", 1000), }) cfg = PipelineConfig() names = self._names_from_builder(cfg, reg, state) - self.assertEqual(names, [ - "transform.process-full-data", "transform.create-dc-config" - ]) + self.assertEqual( + names, + ["transform.process-full-data", "transform.create-dc-config"]) pipeline = build_sdmx_pipeline(config=cfg, state=state, registry=reg) - self.assertEqual([s.name for s in pipeline.get_steps()], - ["transform.process-full-data", "transform.create-dc-config"]) + self.assertEqual( + [s.name for s in pipeline.get_steps()], + ["transform.process-full-data", "transform.create-dc-config"]) if __name__ == "__main__": diff --git a/tools/agentic_import/state_handler.py b/tools/agentic_import/state_handler.py index cb2cb85b02..f4d2af50b0 100644 --- a/tools/agentic_import/state_handler.py +++ b/tools/agentic_import/state_handler.py @@ -37,6 +37,8 @@ class StepState: started_at: str ended_at: str duration_s: float + started_at_ts: int | None = None + ended_at_ts: int | None = None message: str | None = None @@ -47,6 +49,7 @@ class PipelineState: critical_input_hash: str command: str updated_at: str + updated_at_ts: int | None = None steps: dict[str, StepState] = field(default_factory=dict) @@ -108,6 +111,7 @@ def _empty_state(self) -> PipelineState: critical_input_hash="", command="", updated_at="", + updated_at_ts=None, ) def _backup_bad_file(self) -> None: diff --git a/tools/agentic_import/state_handler_test.py b/tools/agentic_import/state_handler_test.py index b8af010fd7..3d3ad3a1d8 100644 --- a/tools/agentic_import/state_handler_test.py +++ b/tools/agentic_import/state_handler_test.py @@ -47,6 +47,7 @@ def test_missing_file_creates_empty_state(self) -> None: data = json.load(fp) self.assertEqual(data["run_id"], "demo") self.assertEqual(data["steps"], {}) + self.assertIsNone(data["updated_at_ts"]) def test_corrupt_file_creates_backup_and_resets_state(self) -> None: with tempfile.TemporaryDirectory() as 
tmpdir:
@@ -67,6 +68,7 @@ def test_corrupt_file_creates_backup_and_resets_state(self) -> None:
         with open(path, encoding="utf-8") as fp:
             data = json.load(fp)
         self.assertEqual(data["steps"], {})
+        self.assertIsNone(data.get("updated_at_ts"))
 
 
 if __name__ == "__main__":

From d6b0c9868b9757bcc079473e4218646532676886 Mon Sep 17 00:00:00 2001
From: Rohit Kumar
Date: Wed, 12 Nov 2025 09:59:21 +0000
Subject: [PATCH 11/54] refactor(sdmx-import): Simplify phase factory lambda

The factory lambda in `build_registry`'s `_spec` helper is simplified by
removing the redundant `full_name` and `ctor` arguments, as these are
already captured in the closure.
---
 tools/agentic_import/sdmx_import_pipeline.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py
index 3e757e0fc3..65fe07ab22 100644
--- a/tools/agentic_import/sdmx_import_pipeline.py
+++ b/tools/agentic_import/sdmx_import_pipeline.py
@@ -386,8 +386,7 @@ def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec:
             phase=phase,
             name=step,
             version=cls.VERSION,
-            factory=lambda cfg, full_name=full, ctor=cls: ctor(name=full_name,
-                                                               config=cfg),
+            factory=lambda cfg: cls(name=full, config=cfg),
         )
 
     download = PhaseSpec(

From 7f15f17a85adb389be0d31f21f76885e52180ceb Mon Sep 17 00:00:00 2001
From: Rohit Kumar
Date: Wed, 12 Nov 2025 10:22:16 +0000
Subject: [PATCH 12/54] Refactor(sdmx-import): Flatten pipeline steps and remove phases

Removes the concept of Phases from the SDMX import pipeline, opting for
a flat list of steps. This simplifies the pipeline structure and updates
tests accordingly.
---
 tools/agentic_import/sdmx_import_pipeline.py | 98 +++--------
 .../sdmx_import_pipeline_test.py | 156 +++++++-----------
 2 files changed, 86 insertions(+), 168 deletions(-)

diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py
index 65fe07ab22..36b10d5e24 100644
--- a/tools/agentic_import/sdmx_import_pipeline.py
+++ b/tools/agentic_import/sdmx_import_pipeline.py
@@ -275,41 +275,18 @@ def dry_run(self) -> None:
 
 @dataclass(frozen=True)
 class StepSpec:
-    phase: str
     name: str
     version: int
     factory: Callable[[PipelineConfig], Step]
 
-    @property
-    def full_name(self) -> str:
-        return _format_full_name(self.phase, self.name)
-
-
-@dataclass(frozen=True)
-class PhaseSpec:
-    name: str
-    steps: Sequence[StepSpec]
-
-
-@dataclass(frozen=True)
-class PhaseRegistry:
-    phases: Sequence[PhaseSpec]
-
-    def flatten(self) -> list[StepSpec]:
-        flattened: list[StepSpec] = []
-        for phase in self.phases:
-            flattened.extend(phase.steps)
-        return flattened
-
 
 class PipelineBuilder:
 
     def __init__(self, *, config: PipelineConfig, state: PipelineState,
-                 registry: PhaseRegistry) -> None:
+                 steps: Sequence[StepSpec]) -> None:
         self._config = config
         self._state = state
-        self._registry = registry
-        self._specs = self._registry.flatten()
+        self._specs = steps
 
     def build(self) -> Pipeline:
         planned = self._plan_steps()
@@ -342,19 +319,13 @@ def _plan_steps(self) -> list[StepSpec]:
 
     def _filter_run_only(self, specs: Sequence[StepSpec],
                          run_only: str) -> list[StepSpec]:
-        is_step = "." 
in run_only - if is_step: - scoped = [s for s in specs if s.full_name == run_only] - else: - scoped = [s for s in specs if s.phase == run_only] - + scoped = [s for s in specs if s.name == run_only] if not scoped: - entity = "step" if is_step else "phase" - raise ValueError(f"run_only {entity} not found: {run_only}") + raise ValueError(f"run_only step not found: {run_only}") return scoped def _should_run(self, spec: StepSpec) -> bool: - prev = self._state.steps.get(spec.full_name) + prev = self._state.steps.get(spec.name) if prev is None: return True if prev.status != "succeeded": @@ -364,8 +335,8 @@ def _should_run(self, spec: StepSpec) -> bool: return False def _predecessor_newer(self, prev_spec: StepSpec, spec: StepSpec) -> bool: - prev_state = self._state.steps.get(prev_spec.full_name) - curr_state = self._state.steps.get(spec.full_name) + prev_state = self._state.steps.get(prev_spec.name) + curr_state = self._state.steps.get(spec.name) if prev_state is None or prev_state.ended_at_ts is None: return False if curr_state is None: @@ -377,52 +348,27 @@ def _predecessor_newer(self, prev_spec: StepSpec, spec: StepSpec) -> bool: return prev_state.ended_at_ts > curr_state.ended_at_ts -def build_registry() -> PhaseRegistry: - """Constructs the hard-coded Phase 2 registry with canonical steps.""" +def build_step_specs() -> list[StepSpec]: + """Constructs the hard-coded list of canonical steps.""" - def _spec(phase: str, step: str, cls: type[SdmxStep]) -> StepSpec: - full = _format_full_name(phase, step) + def _spec(name: str, cls: type[SdmxStep]) -> StepSpec: return StepSpec( - phase=phase, - name=step, + name=name, version=cls.VERSION, - factory=lambda cfg: cls(name=full, config=cfg), + factory=lambda cfg: cls(name=name, config=cfg), ) - download = PhaseSpec( - name="download", - steps=[ - _spec("download", "download-data", DownloadDataStep), - _spec("download", "download-metadata", DownloadMetadataStep), - ], - ) - sample = PhaseSpec( - name="sample", - steps=[ - _spec("sample", "create-sample", CreateSampleStep), - ], - ) - schema_map = PhaseSpec( - name="schema_map", - steps=[ - _spec("schema_map", "create-schema-mapping", CreateSchemaMapStep), - ], - ) - transform = PhaseSpec( - name="transform", - steps=[ - _spec("transform", "process-full-data", ProcessFullDataStep), - _spec("transform", "create-dc-config", CreateDcConfigStep), - ], - ) - return PhaseRegistry(phases=[download, sample, schema_map, transform]) + return [ + _spec("download-data", DownloadDataStep), + _spec("download-metadata", DownloadMetadataStep), + _spec("create-sample", CreateSampleStep), + _spec("create-schema-mapping", CreateSchemaMapStep), + _spec("process-full-data", ProcessFullDataStep), + _spec("create-dc-config", CreateDcConfigStep), + ] def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, - registry: PhaseRegistry) -> Pipeline: - builder = PipelineBuilder(config=config, state=state, registry=registry) + steps: Sequence[StepSpec]) -> Pipeline: + builder = PipelineBuilder(config=config, state=state, steps=steps) return builder.build() - - -def _format_full_name(phase: str, step: str) -> str: - return f"{phase}.{step}" diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 0f26c61a7e..da13a52449 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -41,8 +41,8 @@ from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error 
CreateDcConfigStep, DownloadDataStep, ProcessFullDataStep, InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - PhaseRegistry, PhaseSpec, StepSpec, SdmxStep, build_pipeline_callback, - build_registry, build_sdmx_pipeline) + StepSpec, SdmxStep, build_pipeline_callback, build_step_specs, + build_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -280,8 +280,8 @@ def test_interactive_mode_returns_composite(self) -> None: class PlanningTest(unittest.TestCase): - def _mk_registry(self) -> PhaseRegistry: - return build_registry() + def _mk_steps(self) -> list[StepSpec]: + return build_step_specs() def _empty_state(self) -> PipelineState: return PipelineState(run_id="demo", @@ -314,128 +314,100 @@ def _state_with( def _names_from_builder(self, cfg: PipelineConfig, - reg: PhaseRegistry, + steps: list[StepSpec], state: PipelineState | None = None) -> list[str]: builder = PipelineBuilder(config=cfg, state=state or self._empty_state(), - registry=reg) + steps=steps) pipeline = builder.build() return [step.name for step in pipeline.get_steps()] - def test_run_only_phase_and_step(self) -> None: - reg = self._mk_registry() - cfg_phase = PipelineConfig(run_only="download") - names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual( - names_phase, - ["download.download-data", "download.download-metadata"]) + def test_run_only_step(self) -> None: + steps = self._mk_steps() + cfg_step = PipelineConfig(run_only="download-data") + names_step = self._names_from_builder(cfg_step, steps) + self.assertEqual(names_step, ["download-data"]) - cfg_step = PipelineConfig(run_only="download.download-data") - names_step = self._names_from_builder(cfg_step, reg) - self.assertEqual(names_step, ["download.download-data"]) - - with self.assertRaisesRegex(ValueError, "run_only phase not found"): - self._names_from_builder(PipelineConfig(run_only="nope"), reg) + with self.assertRaisesRegex(ValueError, "run_only step not found"): + self._names_from_builder(PipelineConfig(run_only="nope"), steps) with self.assertRaisesRegex(ValueError, "run_only step not found"): self._names_from_builder(PipelineConfig(run_only="download.nope"), - reg) + steps) def test_force_semantics(self) -> None: - reg = self._mk_registry() + steps = self._mk_steps() cfg_all = PipelineConfig(force=True) - names_all = self._names_from_builder(cfg_all, reg) + names_all = self._names_from_builder(cfg_all, steps) self.assertEqual(names_all, [ - "download.download-data", - "download.download-metadata", - "sample.create-sample", - "schema_map.create-schema-mapping", - "transform.process-full-data", - "transform.create-dc-config", + "download-data", + "download-metadata", + "create-sample", + "create-schema-mapping", + "process-full-data", + "create-dc-config", ]) - cfg_phase = PipelineConfig(run_only="download", force=True) - names_phase = self._names_from_builder(cfg_phase, reg) - self.assertEqual( - names_phase, - ["download.download-data", "download.download-metadata"]) - def test_timestamp_chaining_triggers_next_step(self) -> None: - reg = self._mk_registry() + steps = self._mk_steps() newer = 2_000 older = 1_000 state = self._state_with({ - "download.download-data": (1, "succeeded", newer), - "download.download-metadata": (1, "succeeded", older), - "sample.create-sample": (1, "succeeded", older), - "schema_map.create-schema-mapping": (1, "succeeded", older), - "transform.process-full-data": (1, "succeeded", older), - 
"transform.create-dc-config": (1, "succeeded", older), + "download-data": (1, "succeeded", newer), + "download-metadata": (1, "succeeded", older), + "create-sample": (1, "succeeded", older), + "create-schema-mapping": (1, "succeeded", older), + "process-full-data": (1, "succeeded", older), + "create-dc-config": (1, "succeeded", older), }) cfg = PipelineConfig() - names = self._names_from_builder(cfg, reg, state) + names = self._names_from_builder(cfg, steps, state) self.assertEqual(names, [ - "download.download-metadata", - "sample.create-sample", - "schema_map.create-schema-mapping", - "transform.process-full-data", - "transform.create-dc-config", + "download-metadata", + "create-sample", + "create-schema-mapping", + "process-full-data", + "create-dc-config", ]) def test_run_only_ignores_timestamp_chaining(self) -> None: - reg = self._mk_registry() + steps = self._mk_steps() newer = 4_000 older = 3_000 state = self._state_with({ - "download.download-data": (1, "succeeded", newer), - "download.download-metadata": (1, "succeeded", older), + "download-data": (1, "succeeded", newer), + "download-metadata": (1, "succeeded", older), }) - cfg = PipelineConfig(run_only="download") - names = self._names_from_builder(cfg, reg, state) - self.assertEqual( - names, ["download.download-data", "download.download-metadata"]) + cfg = PipelineConfig(run_only="download-data") + names = self._names_from_builder(cfg, steps, state) + self.assertEqual(names, ["download-data"]) def test_version_bump_schedules_downstream(self) -> None: - reg = PhaseRegistry(phases=[ - PhaseSpec(name="download", - steps=[ - StepSpec( - phase="download", - name="download-data", - version=1, - factory=lambda cfg: DownloadDataStep( - name="download.download-data", config=cfg)), - ]), - PhaseSpec(name="transform", - steps=[ - StepSpec(phase="transform", - name="process-full-data", - version=2, - factory=lambda cfg: ProcessFullDataStep( - name="transform.process-full-data", - config=cfg)), - StepSpec(phase="transform", - name="create-dc-config", - version=1, - factory=lambda cfg: CreateDcConfigStep( - name="transform.create-dc-config", - config=cfg)), - ]) - ]) + steps = [ + StepSpec(name="download-data", + version=1, + factory=lambda cfg: DownloadDataStep(name="download-data", + config=cfg)), + StepSpec(name="process-full-data", + version=2, + factory=lambda cfg: ProcessFullDataStep( + name="process-full-data", config=cfg)), + StepSpec(name="create-dc-config", + version=1, + factory=lambda cfg: CreateDcConfigStep( + name="create-dc-config", config=cfg)), + ] state = self._state_with({ - "download.download-data": (1, "succeeded", 1000), - "transform.process-full-data": (1, "succeeded", 1000), - "transform.create-dc-config": (1, "succeeded", 1000), + "download-data": (1, "succeeded", 1000), + "process-full-data": (1, "succeeded", 1000), + "create-dc-config": (1, "succeeded", 1000), }) cfg = PipelineConfig() - names = self._names_from_builder(cfg, reg, state) - self.assertEqual( - names, - ["transform.process-full-data", "transform.create-dc-config"]) - - pipeline = build_sdmx_pipeline(config=cfg, state=state, registry=reg) - self.assertEqual( - [s.name for s in pipeline.get_steps()], - ["transform.process-full-data", "transform.create-dc-config"]) + names = self._names_from_builder(cfg, steps, state) + self.assertEqual(names, ["process-full-data", "create-dc-config"]) + + pipeline = build_sdmx_pipeline(config=cfg, state=state, steps=steps) + self.assertEqual([s.name for s in pipeline.get_steps()], + ["process-full-data", 
"create-dc-config"]) if __name__ == "__main__": From 475ba5bf8c47830bda252e755b8050d08bd00ddc Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 12 Nov 2025 12:08:22 +0000 Subject: [PATCH 13/54] refactor(sdmx-import): Simplify pipeline step definition and construction Removes the StepSpec dataclass and updates PipelineBuilder to directly accept Step objects. The build_step_specs function is replaced by build_steps, which directly instantiates Step objects. This simplifies the overall pipeline step definition and construction process. --- tools/agentic_import/sdmx_import_pipeline.py | 87 ++++++++----------- .../sdmx_import_pipeline_test.py | 56 ++++++------ 2 files changed, 62 insertions(+), 81 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 36b10d5e24..a6ef7d274f 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -273,70 +273,62 @@ def dry_run(self) -> None: logging.info(f"{self.name} (dry run): previewing DC config creation") -@dataclass(frozen=True) -class StepSpec: - name: str - version: int - factory: Callable[[PipelineConfig], Step] - - class PipelineBuilder: def __init__(self, *, config: PipelineConfig, state: PipelineState, - steps: Sequence[StepSpec]) -> None: + steps: Sequence[Step]) -> None: self._config = config self._state = state - self._specs = steps + self._steps = steps def build(self) -> Pipeline: planned = self._plan_steps() - steps = [spec.factory(self._config) for spec in planned] - logging.info("Built SDMX pipeline with %d steps", len(steps)) - return Pipeline(steps=steps) + logging.info("Built SDMX pipeline with %d steps", len(planned)) + return Pipeline(steps=planned) - def _plan_steps(self) -> list[StepSpec]: + def _plan_steps(self) -> list[Step]: if self._config.run_only: - return self._filter_run_only(self._specs, self._config.run_only) + return self._filter_run_only(self._steps, self._config.run_only) if self._config.force: - return list(self._specs) - scheduled: list[StepSpec] = [] + return list(self._steps) + scheduled: list[Step] = [] schedule_all_remaining = False - previous: StepSpec | None = None - for spec in self._specs: + previous: Step | None = None + for step in self._steps: if schedule_all_remaining: - scheduled.append(spec) + scheduled.append(step) else: - needs_run = self._should_run(spec) + needs_run = self._should_run(step) if not needs_run and previous is not None: - needs_run = self._predecessor_newer(previous, spec) + needs_run = self._predecessor_newer(previous, step) if needs_run: - scheduled.append(spec) + scheduled.append(step) schedule_all_remaining = True - previous = spec + previous = step if not scheduled: logging.info("No steps scheduled.") return scheduled - def _filter_run_only(self, specs: Sequence[StepSpec], - run_only: str) -> list[StepSpec]: - scoped = [s for s in specs if s.name == run_only] + def _filter_run_only(self, steps: Sequence[Step], + run_only: str) -> list[Step]: + scoped = [s for s in steps if s.name == run_only] if not scoped: raise ValueError(f"run_only step not found: {run_only}") return scoped - def _should_run(self, spec: StepSpec) -> bool: - prev = self._state.steps.get(spec.name) + def _should_run(self, step: Step) -> bool: + prev = self._state.steps.get(step.name) if prev is None: return True if prev.status != "succeeded": return True - if prev.version < spec.version: + if prev.version < step.version: return True return False - def _predecessor_newer(self, prev_spec: 
StepSpec, spec: StepSpec) -> bool: - prev_state = self._state.steps.get(prev_spec.name) - curr_state = self._state.steps.get(spec.name) + def _predecessor_newer(self, prev_step: Step, step: Step) -> bool: + prev_state = self._state.steps.get(prev_step.name) + curr_state = self._state.steps.get(step.name) if prev_state is None or prev_state.ended_at_ts is None: return False if curr_state is None: @@ -348,27 +340,22 @@ def _predecessor_newer(self, prev_spec: StepSpec, spec: StepSpec) -> bool: return prev_state.ended_at_ts > curr_state.ended_at_ts -def build_step_specs() -> list[StepSpec]: +def build_steps(config: PipelineConfig) -> list[Step]: """Constructs the hard-coded list of canonical steps.""" - - def _spec(name: str, cls: type[SdmxStep]) -> StepSpec: - return StepSpec( - name=name, - version=cls.VERSION, - factory=lambda cfg: cls(name=name, config=cfg), - ) - return [ - _spec("download-data", DownloadDataStep), - _spec("download-metadata", DownloadMetadataStep), - _spec("create-sample", CreateSampleStep), - _spec("create-schema-mapping", CreateSchemaMapStep), - _spec("process-full-data", ProcessFullDataStep), - _spec("create-dc-config", CreateDcConfigStep), + DownloadDataStep(name="download-data", config=config), + DownloadMetadataStep(name="download-metadata", config=config), + CreateSampleStep(name="create-sample", config=config), + CreateSchemaMapStep(name="create-schema-mapping", config=config), + ProcessFullDataStep(name="process-full-data", config=config), + CreateDcConfigStep(name="create-dc-config", config=config), ] -def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, - steps: Sequence[StepSpec]) -> Pipeline: - builder = PipelineBuilder(config=config, state=state, steps=steps) +def build_sdmx_pipeline(*, + config: PipelineConfig, + state: PipelineState, + steps: Sequence[Step] | None = None) -> Pipeline: + builder_steps = steps if steps is not None else build_steps(config) + builder = PipelineBuilder(config=config, state=state, steps=builder_steps) return builder.build() diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index da13a52449..b6344933bb 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -39,10 +39,8 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - CreateDcConfigStep, DownloadDataStep, ProcessFullDataStep, InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - StepSpec, SdmxStep, build_pipeline_callback, build_step_specs, - build_sdmx_pipeline) + build_pipeline_callback, build_sdmx_pipeline, build_steps) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -76,6 +74,18 @@ def dry_run(self) -> None: logging.info("noop") +class _VersionedStep(BaseStep): + + def __init__(self, name: str, version: int) -> None: + super().__init__(name=name, version=version) + + def run(self) -> None: + logging.info("noop") + + def dry_run(self) -> None: + logging.info("noop") + + class JSONStateCallbackTest(unittest.TestCase): def _build_callback( @@ -280,9 +290,6 @@ def test_interactive_mode_returns_composite(self) -> None: class PlanningTest(unittest.TestCase): - def _mk_steps(self) -> list[StepSpec]: - return build_step_specs() - def _empty_state(self) -> PipelineState: return PipelineState(run_id="demo", critical_input_hash="", @@ -314,30 +321,28 @@ def _state_with( def 
_names_from_builder(self, cfg: PipelineConfig, - steps: list[StepSpec], + steps: list[BaseStep] | None = None, state: PipelineState | None = None) -> list[str]: + builder_steps = steps or build_steps(cfg) builder = PipelineBuilder(config=cfg, state=state or self._empty_state(), - steps=steps) + steps=builder_steps) pipeline = builder.build() return [step.name for step in pipeline.get_steps()] def test_run_only_step(self) -> None: - steps = self._mk_steps() cfg_step = PipelineConfig(run_only="download-data") - names_step = self._names_from_builder(cfg_step, steps) + names_step = self._names_from_builder(cfg_step) self.assertEqual(names_step, ["download-data"]) with self.assertRaisesRegex(ValueError, "run_only step not found"): - self._names_from_builder(PipelineConfig(run_only="nope"), steps) + self._names_from_builder(PipelineConfig(run_only="nope")) with self.assertRaisesRegex(ValueError, "run_only step not found"): - self._names_from_builder(PipelineConfig(run_only="download.nope"), - steps) + self._names_from_builder(PipelineConfig(run_only="download.nope")) def test_force_semantics(self) -> None: - steps = self._mk_steps() cfg_all = PipelineConfig(force=True) - names_all = self._names_from_builder(cfg_all, steps) + names_all = self._names_from_builder(cfg_all) self.assertEqual(names_all, [ "download-data", "download-metadata", @@ -348,7 +353,6 @@ def test_force_semantics(self) -> None: ]) def test_timestamp_chaining_triggers_next_step(self) -> None: - steps = self._mk_steps() newer = 2_000 older = 1_000 state = self._state_with({ @@ -360,7 +364,7 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: "create-dc-config": (1, "succeeded", older), }) cfg = PipelineConfig() - names = self._names_from_builder(cfg, steps, state) + names = self._names_from_builder(cfg, state=state) self.assertEqual(names, [ "download-metadata", "create-sample", @@ -370,7 +374,6 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: ]) def test_run_only_ignores_timestamp_chaining(self) -> None: - steps = self._mk_steps() newer = 4_000 older = 3_000 state = self._state_with({ @@ -378,23 +381,14 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: "download-metadata": (1, "succeeded", older), }) cfg = PipelineConfig(run_only="download-data") - names = self._names_from_builder(cfg, steps, state) + names = self._names_from_builder(cfg, state=state) self.assertEqual(names, ["download-data"]) def test_version_bump_schedules_downstream(self) -> None: steps = [ - StepSpec(name="download-data", - version=1, - factory=lambda cfg: DownloadDataStep(name="download-data", - config=cfg)), - StepSpec(name="process-full-data", - version=2, - factory=lambda cfg: ProcessFullDataStep( - name="process-full-data", config=cfg)), - StepSpec(name="create-dc-config", - version=1, - factory=lambda cfg: CreateDcConfigStep( - name="create-dc-config", config=cfg)), + _VersionedStep("download-data", 1), + _VersionedStep("process-full-data", 2), + _VersionedStep("create-dc-config", 1), ] state = self._state_with({ "download-data": (1, "succeeded", 1000), From 24d0e0cee3039f16c981506263c918a0bf61fd8e Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 13 Nov 2025 02:46:39 +0000 Subject: [PATCH 14/54] feat(sdmx-import): add run_sdmx_pipeline orchestration resolve dataset prefix/hash, working dir, and state metadata before running update state handler/tests to expect dataset_prefix and command --- tools/agentic_import/sdmx_import_pipeline.py | 90 ++++++++++++- .../sdmx_import_pipeline_test.py | 124 
+++++++++++++++--- tools/agentic_import/state_handler.py | 9 +- tools/agentic_import/state_handler_test.py | 4 +- 4 files changed, 201 insertions(+), 26 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index a6ef7d274f..c6946a4e49 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,15 +15,20 @@ from __future__ import annotations +import hashlib +import json +import os +import re from dataclasses import dataclass from datetime import datetime, timezone +from pathlib import Path from typing import Callable, Sequence from absl import logging from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, - Step) + PipelineRunner, RunnerConfig, Step) from tools.agentic_import.state_handler import (PipelineState, StateHandler, StepState) @@ -57,14 +62,14 @@ class JSONStateCallback(PipelineCallback): def __init__(self, *, state_handler: StateHandler, - run_id: str, + dataset_prefix: str, critical_input_hash: str, command: str, now_fn: Callable[[], datetime] | None = None) -> None: self._handler = state_handler self._now_fn = now_fn or (lambda: datetime.now(timezone.utc)) self._state = self._handler.get_state() - self._state.run_id = run_id + self._state.dataset_prefix = dataset_prefix self._state.critical_input_hash = critical_input_hash self._state.command = command self._step_start_times: dict[str, datetime] = {} @@ -115,7 +120,7 @@ def _now(self) -> datetime: def build_pipeline_callback( *, state_handler: StateHandler, - run_id: str, + dataset_prefix: str, critical_input_hash: str, command: str, skip_confirmation: bool, @@ -123,7 +128,7 @@ def build_pipeline_callback( ) -> PipelineCallback: """Constructs the pipeline callback stack for the SDMX runner.""" json_callback = JSONStateCallback(state_handler=state_handler, - run_id=run_id, + dataset_prefix=dataset_prefix, critical_input_hash=critical_input_hash, command=command, now_fn=now_fn) @@ -141,6 +146,7 @@ class PipelineConfig: phase. Defaults are intentionally minimal. 
""" + command: str endpoint: str | None = None agency: str | None = None dataflow: str | None = None @@ -359,3 +365,77 @@ def build_sdmx_pipeline(*, builder_steps = steps if steps is not None else build_steps(config) builder = PipelineBuilder(config=config, state=state, steps=builder_steps) return builder.build() + + +def _sanitize_run_id(dataflow: str) -> str: + normalized = dataflow.lower() + normalized = re.sub(r"[^a-z0-9_]+", "_", normalized) + normalized = re.sub(r"_+", "_", normalized) + return normalized.strip("_") + + +def _resolve_dataset_prefix(config: PipelineConfig) -> str: + if config.dataset_prefix: + return config.dataset_prefix + if not config.dataflow: + raise ValueError( + "dataflow or dataset_prefix is required to derive dataset prefix") + sanitized = _sanitize_run_id(config.dataflow) + if not sanitized: + raise ValueError("dataflow value is invalid after sanitization") + return sanitized + + +def _compute_critical_input_hash(config: PipelineConfig) -> str: + payload = { + "agency": config.agency, + "dataflow": config.dataflow, + "endpoint": config.endpoint, + "key": config.key, + } + serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(serialized.encode("utf-8")).hexdigest() + + +def _resolve_working_dir(config: PipelineConfig) -> Path: + directory = Path(config.working_dir or os.getcwd()) + if directory.exists(): + if not directory.is_dir(): + raise ValueError(f"working_dir is not a directory: {directory}") + else: + directory.mkdir(parents=True, exist_ok=True) + return directory + + +def run_sdmx_pipeline( + *, + config: PipelineConfig, + now_fn: Callable[[], datetime] | None = None, +) -> None: + """Orchestrates the SDMX pipeline for the provided configuration.""" + working_dir = _resolve_working_dir(config) + dataset_prefix = _resolve_dataset_prefix(config) + state_handler = StateHandler( + state_path=working_dir / ".datacommons" / + f"{dataset_prefix}.state.json", + dataset_prefix=dataset_prefix, + ) + state = state_handler.get_state() + critical_hash = _compute_critical_input_hash(config) + state.dataset_prefix = dataset_prefix + state.command = config.command + state.critical_input_hash = critical_hash + state_handler.save_state() + pipeline = build_sdmx_pipeline(config=config, state=state) + callback = build_pipeline_callback( + state_handler=state_handler, + dataset_prefix=dataset_prefix, + critical_input_hash=critical_hash, + command=config.command, + skip_confirmation=config.skip_confirmation, + now_fn=now_fn, + ) + if config.verbose: + logging.set_verbosity(logging.DEBUG) + runner = PipelineRunner(RunnerConfig()) + runner.run(pipeline, callback) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index b6344933bb..4c6a35db86 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -17,12 +17,15 @@ from __future__ import annotations +import hashlib +import dataclasses import json import os import sys import tempfile import unittest from datetime import datetime, timedelta, timezone +from pathlib import Path from unittest import mock from absl import logging @@ -34,13 +37,16 @@ if path not in sys.path: sys.path.append(path) +_TEST_COMMAND = "sdmx pipeline test" + from tools.agentic_import.pipeline import ( # pylint: disable=import-error BaseStep, CompositeCallback, Pipeline, PipelineAbort, PipelineRunner, RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: 
disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - build_pipeline_callback, build_sdmx_pipeline, build_steps) + build_pipeline_callback, build_sdmx_pipeline, build_steps, + run_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -95,7 +101,7 @@ def _build_callback( handler = StateHandler(state_path=state_path, dataset_prefix="demo") callback = JSONStateCallback( state_handler=handler, - run_id="demo", + dataset_prefix="demo", critical_input_hash="abc123", command="python run", now_fn=clock, @@ -117,7 +123,7 @@ def test_successful_step_persists_expected_schema(self) -> None: state = json.load(fp) step_state = state["steps"]["download.download-data"] - self.assertEqual(state["run_id"], "demo") + self.assertEqual(state["dataset_prefix"], "demo") self.assertEqual(state["critical_input_hash"], "abc123") self.assertEqual(step_state["status"], "succeeded") self.assertIn("started_at", step_state) @@ -167,7 +173,7 @@ def test_abort_skips_state_persistence(self) -> None: os.makedirs(state_dir, exist_ok=True) state_path = os.path.join(state_dir, "demo.state.json") previous = { - "run_id": "previous", + "dataset_prefix": "previous", "critical_input_hash": "old", "command": "old command", "updated_at": "2025-01-01T00:00:00Z", @@ -267,7 +273,7 @@ def test_skip_confirmation_returns_json_callback(self) -> None: handler = self._state_handler_for_tmpdir(tmpdir) callback = build_pipeline_callback( state_handler=handler, - run_id="demo", + dataset_prefix="demo", critical_input_hash="abc", command="python run", skip_confirmation=True, @@ -280,7 +286,7 @@ def test_interactive_mode_returns_composite(self) -> None: with mock.patch("builtins.input", return_value="y"): callback = build_pipeline_callback( state_handler=handler, - run_id="demo", + dataset_prefix="demo", critical_input_hash="abc", command="python run", skip_confirmation=False, @@ -291,7 +297,7 @@ def test_interactive_mode_returns_composite(self) -> None: class PlanningTest(unittest.TestCase): def _empty_state(self) -> PipelineState: - return PipelineState(run_id="demo", + return PipelineState(dataset_prefix="demo", critical_input_hash="", command="", updated_at="", @@ -312,7 +318,7 @@ def _state_with( message=None) for name, (v, st, ts) in versions.items() } - return PipelineState(run_id="demo", + return PipelineState(dataset_prefix="demo", critical_input_hash="", command="", updated_at="", @@ -331,17 +337,20 @@ def _names_from_builder(self, return [step.name for step in pipeline.get_steps()] def test_run_only_step(self) -> None: - cfg_step = PipelineConfig(run_only="download-data") + cfg_step = PipelineConfig(command=_TEST_COMMAND, + run_only="download-data") names_step = self._names_from_builder(cfg_step) self.assertEqual(names_step, ["download-data"]) with self.assertRaisesRegex(ValueError, "run_only step not found"): - self._names_from_builder(PipelineConfig(run_only="nope")) + self._names_from_builder( + PipelineConfig(command=_TEST_COMMAND, run_only="nope")) with self.assertRaisesRegex(ValueError, "run_only step not found"): - self._names_from_builder(PipelineConfig(run_only="download.nope")) + self._names_from_builder( + PipelineConfig(command=_TEST_COMMAND, run_only="download.nope")) def test_force_semantics(self) -> None: - cfg_all = PipelineConfig(force=True) + cfg_all = PipelineConfig(command=_TEST_COMMAND, force=True) names_all = self._names_from_builder(cfg_all) self.assertEqual(names_all, [ "download-data", 
@@ -363,7 +372,7 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: "process-full-data": (1, "succeeded", older), "create-dc-config": (1, "succeeded", older), }) - cfg = PipelineConfig() + cfg = PipelineConfig(command=_TEST_COMMAND) names = self._names_from_builder(cfg, state=state) self.assertEqual(names, [ "download-metadata", @@ -380,7 +389,7 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: "download-data": (1, "succeeded", newer), "download-metadata": (1, "succeeded", older), }) - cfg = PipelineConfig(run_only="download-data") + cfg = PipelineConfig(command=_TEST_COMMAND, run_only="download-data") names = self._names_from_builder(cfg, state=state) self.assertEqual(names, ["download-data"]) @@ -395,7 +404,7 @@ def test_version_bump_schedules_downstream(self) -> None: "process-full-data": (1, "succeeded", 1000), "create-dc-config": (1, "succeeded", 1000), }) - cfg = PipelineConfig() + cfg = PipelineConfig(command=_TEST_COMMAND) names = self._names_from_builder(cfg, steps, state) self.assertEqual(names, ["process-full-data", "create-dc-config"]) @@ -404,5 +413,90 @@ def test_version_bump_schedules_downstream(self) -> None: ["process-full-data", "create-dc-config"]) +class RunPipelineTest(unittest.TestCase): + + def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, + command: str) -> PipelineConfig: + return PipelineConfig(endpoint="https://api.example.com", + agency="TEST_AGENCY", + dataflow=dataflow, + key="test-key", + dataset_prefix=dataset_prefix, + working_dir=self._tmpdir, + skip_confirmation=True, + command=command) + + def setUp(self) -> None: + self._tmpdir_obj = tempfile.TemporaryDirectory() + self.addCleanup(self._tmpdir_obj.cleanup) + self._tmpdir = self._tmpdir_obj.name + + def test_run_pipeline_updates_state_and_hash(self) -> None: + command = "sdmx run pipeline" + config = self._build_config(dataset_prefix="demo", + dataflow="df.1", + command=command) + clock = _IncrementingClock(datetime(2025, 1, 2, tzinfo=timezone.utc), + timedelta(seconds=2)) + + run_sdmx_pipeline(config=config, now_fn=clock) + + state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" + self.assertTrue(state_path.exists()) + with state_path.open(encoding="utf-8") as fp: + state = json.load(fp) + + expected_hash = hashlib.sha256( + json.dumps( + { + "agency": config.agency, + "dataflow": config.dataflow, + "endpoint": config.endpoint, + "key": config.key, + }, + sort_keys=True, + separators=(",", ":")).encode("utf-8")).hexdigest() + + self.assertEqual(state["dataset_prefix"], "demo") + self.assertEqual(state["command"], command) + self.assertEqual(state["critical_input_hash"], expected_hash) + self.assertEqual(len(state["steps"]), 6) + + for step_name in [ + "download-data", "download-metadata", "create-sample", + "create-schema-mapping", "process-full-data", "create-dc-config" + ]: + self.assertIn(step_name, state["steps"]) + self.assertEqual(state["steps"][step_name]["status"], "succeeded") + + def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: + dataflow = "My Flow-Name 2025!!!" 
+ config = self._build_config(dataset_prefix=None, + dataflow=dataflow, + command="sdmx run sanitized") + run_sdmx_pipeline(config=config, + now_fn=_IncrementingClock( + datetime(2025, 1, 3, tzinfo=timezone.utc), + timedelta(seconds=2))) + + expected_run_id = "my_flow_name_2025" + state_path = Path( + self._tmpdir) / ".datacommons" / f"{expected_run_id}.state.json" + self.assertTrue(state_path.exists()) + with state_path.open(encoding="utf-8") as fp: + state = json.load(fp) + self.assertEqual(state["dataset_prefix"], expected_run_id) + + def test_invalid_working_dir_raises(self) -> None: + path = Path(self._tmpdir) / "not_a_dir" + path.write_text("content") + config = dataclasses.replace(self._build_config( + dataset_prefix="demo", dataflow="df", command="sdmx run invalid"), + working_dir=str(path)) + with self.assertRaisesRegex(ValueError, + "working_dir is not a directory"): + run_sdmx_pipeline(config=config) + + if __name__ == "__main__": unittest.main() diff --git a/tools/agentic_import/state_handler.py b/tools/agentic_import/state_handler.py index f4d2af50b0..31dabccc1f 100644 --- a/tools/agentic_import/state_handler.py +++ b/tools/agentic_import/state_handler.py @@ -45,7 +45,7 @@ class StepState: @dataclass_json @dataclass class PipelineState: - run_id: str + dataset_prefix: str critical_input_hash: str command: str updated_at: str @@ -87,8 +87,9 @@ def _load_or_init(self) -> PipelineState: with path.open("r", encoding="utf-8") as fp: data = json.load(fp) state = PipelineState.from_dict(data) - if not state.run_id: - state.run_id = self._dataset_prefix + if not state.dataset_prefix: + # Ensure a manual or corrupted state file still has prefix metadata. + state.dataset_prefix = self._dataset_prefix return state except (OSError, json.JSONDecodeError, ValueError, TypeError) as exc: logging.warning(f"Failed to load state file {path}: {exc}") @@ -107,7 +108,7 @@ def _write_state(self, state: PipelineState) -> None: def _empty_state(self) -> PipelineState: return PipelineState( - run_id=self._dataset_prefix, + dataset_prefix=self._dataset_prefix, critical_input_hash="", command="", updated_at="", diff --git a/tools/agentic_import/state_handler_test.py b/tools/agentic_import/state_handler_test.py index 3d3ad3a1d8..c000260f6d 100644 --- a/tools/agentic_import/state_handler_test.py +++ b/tools/agentic_import/state_handler_test.py @@ -40,12 +40,12 @@ def test_missing_file_creates_empty_state(self) -> None: state = handler.get_state() self.assertTrue(os.path.exists(path)) - self.assertEqual(state.run_id, "demo") + self.assertEqual(state.dataset_prefix, "demo") self.assertEqual(state.steps, {}) with open(path, encoding="utf-8") as fp: data = json.load(fp) - self.assertEqual(data["run_id"], "demo") + self.assertEqual(data["dataset_prefix"], "demo") self.assertEqual(data["steps"], {}) self.assertIsNone(data["updated_at_ts"]) From c41eb3f6172e7065bd4c06cadc05de4e5fab22d5 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 13 Nov 2025 04:27:55 +0000 Subject: [PATCH 15/54] feat(sdmx-import): add CLI entrypoint for SDMX pipeline Map absl flags into PipelineConfig and add a runnable main. 
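For reference, a minimal sketch of exercising the same configuration without the absl entrypoint; the endpoint, agency, and dataflow values below are placeholders, and the flat PipelineConfig fields match this patch (a later patch in this series regroups them under nested sdmx/run configs):

    from tools.agentic_import.sdmx_import_pipeline import (
        PipelineConfig, run_sdmx_pipeline)

    # Roughly what main() assembles from --endpoint/--agency/--dataflow et al.
    config = PipelineConfig(
        command="python tools/agentic_import/sdmx_import_pipeline.py ...",
        endpoint="https://stats.example.org/rest",  # placeholder endpoint
        agency="EXAMPLE_AGENCY",                    # placeholder agency
        dataflow="DF_EXAMPLE",                      # placeholder dataflow id
        skip_confirmation=True,                     # skip the interactive prompt
    )
    run_sdmx_pipeline(config=config)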
--- tools/agentic_import/sdmx_import_pipeline.py | 78 +++++++++++++++++-- .../sdmx_import_pipeline_test.py | 6 +- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index c6946a4e49..4714f1117c 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -19,12 +19,14 @@ import json import os import re +import shlex +import sys from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from typing import Callable, Sequence -from absl import logging +from absl import app, flags, logging from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, @@ -32,6 +34,40 @@ from tools.agentic_import.state_handler import (PipelineState, StateHandler, StepState) +FLAGS = flags.FLAGS + + +def _define_flags() -> None: + flags.DEFINE_string("endpoint", None, "SDMX service endpoint.") + flags.mark_flag_as_required("endpoint") + + flags.DEFINE_string("agency", None, "Owning SDMX agency identifier.") + flags.mark_flag_as_required("agency") + + flags.DEFINE_string("dataflow", None, "Target SDMX dataflow identifier.") + flags.mark_flag_as_required("dataflow") + + flags.DEFINE_string("dataflow_key", None, "Optional SDMX key or filter.") + flags.DEFINE_alias("key", "dataflow_key") + + flags.DEFINE_string( + "dataflow_param", None, + "Optional SDMX parameter appended to the dataflow query.") + + flags.DEFINE_string( + "dataset_prefix", None, + "Optional dataset prefix to override auto-derived values.") + + flags.DEFINE_string("run_only", None, + "Execute only a specific pipeline step by name.") + + flags.DEFINE_boolean("force", False, "Force all steps to run.") + + flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") + + flags.DEFINE_boolean("skip_confirmation", False, + "Skip interactive confirmation prompts.") + def _format_time(value: datetime) -> str: if value.tzinfo is None: @@ -150,15 +186,15 @@ class PipelineConfig: endpoint: str | None = None agency: str | None = None dataflow: str | None = None - key: str | None = None + dataflow_key: str | None = None + dataflow_param: str | None = None dataset_prefix: str | None = None - working_dir: str | None = None + working_dir: str | None = None # TODO: Add CLI flag once semantics stabilize. 
run_only: str | None = None force: bool = False verbose: bool = False skip_confirmation: bool = False - class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -391,7 +427,8 @@ def _compute_critical_input_hash(config: PipelineConfig) -> str: "agency": config.agency, "dataflow": config.dataflow, "endpoint": config.endpoint, - "key": config.key, + "dataflow_key": config.dataflow_key, + "dataflow_param": config.dataflow_param, } serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) return hashlib.sha256(serialized.encode("utf-8")).hexdigest() @@ -439,3 +476,34 @@ def run_sdmx_pipeline( logging.set_verbosity(logging.DEBUG) runner = PipelineRunner(RunnerConfig()) runner.run(pipeline, callback) + + +def prepare_config() -> PipelineConfig: + """Builds PipelineConfig from CLI flags.""" + command = shlex.join(sys.argv) if sys.argv else "python" + return PipelineConfig( + command=command, + endpoint=FLAGS.endpoint, + agency=FLAGS.agency, + dataflow=FLAGS.dataflow, + dataflow_key=FLAGS.dataflow_key, + dataflow_param=FLAGS.dataflow_param, + dataset_prefix=FLAGS.dataset_prefix, + working_dir=None, + run_only=FLAGS.run_only, + force=FLAGS.force, + verbose=FLAGS.verbose, + skip_confirmation=FLAGS.skip_confirmation, + ) + + +def main(_: list[str]) -> int: + config = prepare_config() + logging.info(f"SDMX pipeline configuration: {config}") + run_sdmx_pipeline(config=config) + return 0 + + +if __name__ == "__main__": + _define_flags() + app.run(main) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 4c6a35db86..2007e854cb 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -420,7 +420,8 @@ def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, return PipelineConfig(endpoint="https://api.example.com", agency="TEST_AGENCY", dataflow=dataflow, - key="test-key", + dataflow_key="test-key", + dataflow_param="area=US", dataset_prefix=dataset_prefix, working_dir=self._tmpdir, skip_confirmation=True, @@ -452,7 +453,8 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: "agency": config.agency, "dataflow": config.dataflow, "endpoint": config.endpoint, - "key": config.key, + "dataflow_key": config.dataflow_key, + "dataflow_param": config.dataflow_param, }, sort_keys=True, separators=(",", ":")).encode("utf-8")).hexdigest() From 4be87a72b0722d6fea0e4b3cc8b5c8150d674d1a Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Fri, 14 Nov 2025 04:48:28 +0000 Subject: [PATCH 16/54] refactor: centralize SDMX pipeline planning move hash comparison into PipelineBuilder, snapshot state before planning, and add tests covering rerun vs no-op flows --- tools/agentic_import/sdmx_import_pipeline.py | 52 ++++++++++++++----- .../sdmx_import_pipeline_test.py | 47 +++++++++++++++++ 2 files changed, 85 insertions(+), 14 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 4714f1117c..aad7d6e0de 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,6 +15,7 @@ from __future__ import annotations +import copy import hashlib import json import os @@ -195,6 +196,7 @@ class PipelineConfig: verbose: bool = False skip_confirmation: bool = False + class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -317,22 +319,32 @@ def dry_run(self) -> None: 
class PipelineBuilder: - def __init__(self, *, config: PipelineConfig, state: PipelineState, - steps: Sequence[Step]) -> None: + def __init__(self, + *, + config: PipelineConfig, + state: PipelineState, + steps: Sequence[Step], + critical_input_hash: str | None = None) -> None: self._config = config self._state = state self._steps = steps + self._critical_input_hash = critical_input_hash def build(self) -> Pipeline: - planned = self._plan_steps() + if self._config.run_only: + planned = self._filter_run_only(self._steps, self._config.run_only) + elif self._config.force: + logging.info("Force flag set; scheduling all SDMX steps") + planned = list(self._steps) + elif self._hash_changed(): + logging.info("Critical inputs changed; scheduling all SDMX steps") + planned = list(self._steps) + else: + planned = self._plan_steps() logging.info("Built SDMX pipeline with %d steps", len(planned)) return Pipeline(steps=planned) def _plan_steps(self) -> list[Step]: - if self._config.run_only: - return self._filter_run_only(self._steps, self._config.run_only) - if self._config.force: - return list(self._steps) scheduled: list[Step] = [] schedule_all_remaining = False previous: Step | None = None @@ -358,6 +370,14 @@ def _filter_run_only(self, steps: Sequence[Step], raise ValueError(f"run_only step not found: {run_only}") return scoped + def _hash_changed(self) -> bool: + if not self._critical_input_hash: + return False + previous = self._state.critical_input_hash + if not previous: + return True + return previous != self._critical_input_hash + def _should_run(self, step: Step) -> bool: prev = self._state.steps.get(step.name) if prev is None: @@ -397,9 +417,13 @@ def build_steps(config: PipelineConfig) -> list[Step]: def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, - steps: Sequence[Step] | None = None) -> Pipeline: + steps: Sequence[Step] | None = None, + critical_input_hash: str | None = None) -> Pipeline: builder_steps = steps if steps is not None else build_steps(config) - builder = PipelineBuilder(config=config, state=state, steps=builder_steps) + builder = PipelineBuilder(config=config, + state=state, + steps=builder_steps, + critical_input_hash=critical_input_hash) return builder.build() @@ -458,12 +482,12 @@ def run_sdmx_pipeline( dataset_prefix=dataset_prefix, ) state = state_handler.get_state() + # Snapshot state for planning so callback mutations do not affect scheduling. 
+ state_snapshot = copy.deepcopy(state) critical_hash = _compute_critical_input_hash(config) - state.dataset_prefix = dataset_prefix - state.command = config.command - state.critical_input_hash = critical_hash - state_handler.save_state() - pipeline = build_sdmx_pipeline(config=config, state=state) + pipeline = build_sdmx_pipeline(config=config, + state=state_snapshot, + critical_input_hash=critical_hash) callback = build_pipeline_callback( state_handler=state_handler, dataset_prefix=dataset_prefix, diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 2007e854cb..6e0f77ec1c 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -499,6 +499,53 @@ def test_invalid_working_dir_raises(self) -> None: "working_dir is not a directory"): run_sdmx_pipeline(config=config) + def test_hash_change_forces_full_rerun(self) -> None: + config = self._build_config(dataset_prefix="demo", + dataflow="df.2", + command="sdmx rerun force") + first_clock = _IncrementingClock( + datetime(2025, 1, 4, tzinfo=timezone.utc), timedelta(seconds=1)) + run_sdmx_pipeline(config=config, now_fn=first_clock) + + state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" + with state_path.open(encoding="utf-8") as fp: + first_state = json.load(fp) + + updated_config = dataclasses.replace(config, dataflow_key="changed-key") + second_clock = _IncrementingClock( + datetime(2025, 1, 5, tzinfo=timezone.utc), timedelta(seconds=1)) + run_sdmx_pipeline(config=updated_config, now_fn=second_clock) + + with state_path.open(encoding="utf-8") as fp: + second_state = json.load(fp) + + self.assertNotEqual(first_state["critical_input_hash"], + second_state["critical_input_hash"]) + self.assertGreater( + second_state["steps"]["download-data"]["ended_at_ts"], + first_state["steps"]["download-data"]["ended_at_ts"]) + + def test_hash_unchanged_skips_rerun(self) -> None: + config = self._build_config(dataset_prefix="demo", + dataflow="df.3", + command="sdmx rerun noop") + initial_clock = _IncrementingClock( + datetime(2025, 1, 6, tzinfo=timezone.utc), timedelta(seconds=1)) + run_sdmx_pipeline(config=config, now_fn=initial_clock) + + state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" + with state_path.open(encoding="utf-8") as fp: + first_state = json.load(fp) + + later_clock = _IncrementingClock( + datetime(2025, 1, 7, tzinfo=timezone.utc), timedelta(seconds=1)) + run_sdmx_pipeline(config=config, now_fn=later_clock) + + with state_path.open(encoding="utf-8") as fp: + second_state = json.load(fp) + + self.assertEqual(first_state, second_state) + if __name__ == "__main__": unittest.main() From 69831c23099e0cb30b8573cc8c0e9554949531ff Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Fri, 14 Nov 2025 05:08:33 +0000 Subject: [PATCH 17/54] fix: add repo root to sys.path Ensure sdmx pipeline can run via python tools/ without import errors. 
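For context, the hunk below boils down to the following bootstrap idiom, restated here with explanatory comments (same code, no new behavior):

    import sys
    from pathlib import Path

    # sdmx_import_pipeline.py sits at tools/agentic_import/, so two parent
    # hops above the resolved file path land on the repository root.
    REPO_ROOT = Path(__file__).resolve().parents[2]

    # Insert at the front so absolute imports like tools.agentic_import.pipeline
    # resolve from this checkout even when the script is run directly as
    # "python tools/agentic_import/sdmx_import_pipeline.py".
    if str(REPO_ROOT) not in sys.path:
        sys.path.insert(0, str(REPO_ROOT))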
--- tools/agentic_import/sdmx_import_pipeline.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index aad7d6e0de..f28b2fc7da 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -29,6 +29,10 @@ from absl import app, flags, logging +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, PipelineRunner, RunnerConfig, Step) From 95214cb29104f3820113357616d7b005d8b780ff Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Fri, 14 Nov 2025 10:32:48 +0000 Subject: [PATCH 18/54] chore: log SDMX pipeline step outcomes Track per-step decisions so build/rerun reasons are recorded. --- tools/agentic_import/pipeline.py | 12 +- tools/agentic_import/sdmx_import_pipeline.py | 165 ++++++++++++++---- .../sdmx_import_pipeline_test.py | 36 +++- 3 files changed, 171 insertions(+), 42 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index 6c489e89a4..bf4890b9de 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -121,7 +121,7 @@ def run(self, try: for step in steps: current_step = step - logging.info(f"Preparing step {step.name} (v{step.version})") + logging.info(f"[STEP START] {step.name} (v{step.version})") if callback: callback.before_step(step) error: Exception | None = None @@ -129,13 +129,17 @@ def run(self, step.run() except Exception as exc: # pylint: disable=broad-except error = exc - logging.exception(f"Step {step.name} failed") + logging.exception( + f"[STEP END] {step.name} (v{step.version}) status=failed" + ) raise finally: if callback: callback.after_step(step, error=error) - logging.info(f"Finished step {step.name}") + logging.info( + f"[STEP END] {step.name} (v{step.version}) status=succeeded" + ) logging.info("Pipeline completed") except PipelineAbort: name = current_step.name if current_step else "" - logging.info(f"Pipeline aborted at {name}") + logging.info(f"[STEP END] {name} status=aborted; pipeline aborted") diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index f28b2fc7da..8f419dbf3c 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -25,7 +25,7 @@ from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Callable, Sequence +from typing import Callable, ClassVar, Sequence from absl import app, flags, logging @@ -201,6 +201,26 @@ class PipelineConfig: skip_confirmation: bool = False +@dataclass(frozen=True) +class StepDecision: + """Represents whether a step will run and why.""" + + RUN: ClassVar[str] = "RUN" + SKIP: ClassVar[str] = "SKIP" + + step_name: str + decision: str + reason: str + + +@dataclass(frozen=True) +class BuildResult: + """Output of planning that includes the pipeline and per-step decisions.""" + + pipeline: Pipeline + decisions: list[StepDecision] + + class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -334,45 +354,120 @@ def __init__(self, self._steps = steps self._critical_input_hash = critical_input_hash - def build(self) -> Pipeline: + def build(self) -> BuildResult: if self._config.run_only: - planned = self._filter_run_only(self._steps, 
self._config.run_only) + planned, decisions = self._plan_run_only(self._config.run_only) elif self._config.force: logging.info("Force flag set; scheduling all SDMX steps") - planned = list(self._steps) + planned, decisions = self._plan_all_steps( + "Force flag set; scheduling this step") elif self._hash_changed(): logging.info("Critical inputs changed; scheduling all SDMX steps") - planned = list(self._steps) + planned, decisions = self._plan_all_steps( + "Critical inputs changed; scheduling this step") else: - planned = self._plan_steps() + planned, decisions = self._plan_incremental() logging.info("Built SDMX pipeline with %d steps", len(planned)) - return Pipeline(steps=planned) + return BuildResult(pipeline=Pipeline(steps=planned), + decisions=decisions) + + def _plan_run_only(self, + run_only: str) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + for step in self._steps: + if step.name == run_only: + planned.append(step) + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.RUN, + reason=(f"run_only={run_only} requested; running only " + "this step"), + )) + else: + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.SKIP, + reason=(f"run_only={run_only} requested; skipping " + "this step"), + )) + if not planned: + raise ValueError(f"run_only step not found: {run_only}") + return planned, decisions - def _plan_steps(self) -> list[Step]: - scheduled: list[Step] = [] + def _plan_all_steps(self, + reason: str) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + for step in self._steps: + planned.append(step) + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.RUN, + reason=reason)) + return planned, decisions + + def _plan_incremental(self) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] schedule_all_remaining = False previous: Step | None = None for step in self._steps: if schedule_all_remaining: - scheduled.append(step) + planned.append(step) + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.RUN, + reason=("Upstream step triggered rerun for remaining " + "steps"), + )) + previous = step + continue + + prev_state = self._state.steps.get(step.name) + if prev_state is None: + needs_run = True + reason = "No previous state recorded; scheduling step" + elif prev_state.status != "succeeded": + needs_run = True + reason = (f"Previous run status was {prev_state.status}; " + "rerunning step") + elif prev_state.version < step.version: + needs_run = True + reason = ( + f"Step version increased from {prev_state.version} to " + f"{step.version}; rerunning step") + else: + needs_run = False + reason = ("Previous run succeeded with same version; step is " + "up-to-date") + + if not needs_run and previous is not None: + if self._predecessor_newer(previous, step): + needs_run = True + reason = (f"Previous step {previous.name} finished more " + "recently; rerunning downstream steps") + + if needs_run: + planned.append(step) + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.RUN, + reason=reason)) + schedule_all_remaining = True else: - needs_run = self._should_run(step) - if not needs_run and previous is not None: - needs_run = self._predecessor_newer(previous, step) - if needs_run: - scheduled.append(step) - schedule_all_remaining = True + decisions.append( + 
StepDecision(step_name=step.name, + decision=StepDecision.SKIP, + reason=reason)) previous = step - if not scheduled: - logging.info("No steps scheduled.") - return scheduled - def _filter_run_only(self, steps: Sequence[Step], - run_only: str) -> list[Step]: - scoped = [s for s in steps if s.name == run_only] - if not scoped: - raise ValueError(f"run_only step not found: {run_only}") - return scoped + if not planned: + logging.info("No steps scheduled.") + return planned, decisions def _hash_changed(self) -> bool: if not self._critical_input_hash: @@ -382,16 +477,6 @@ def _hash_changed(self) -> bool: return True return previous != self._critical_input_hash - def _should_run(self, step: Step) -> bool: - prev = self._state.steps.get(step.name) - if prev is None: - return True - if prev.status != "succeeded": - return True - if prev.version < step.version: - return True - return False - def _predecessor_newer(self, prev_step: Step, step: Step) -> bool: prev_state = self._state.steps.get(prev_step.name) curr_state = self._state.steps.get(step.name) @@ -418,6 +503,12 @@ def build_steps(config: PipelineConfig) -> list[Step]: ] +def _log_step_decisions(decisions: Sequence[StepDecision]) -> None: + for decision in decisions: + logging.info("step=%s decision=%s reason=%s", decision.step_name, + decision.decision, decision.reason) + + def build_sdmx_pipeline(*, config: PipelineConfig, state: PipelineState, @@ -428,7 +519,9 @@ def build_sdmx_pipeline(*, state=state, steps=builder_steps, critical_input_hash=critical_input_hash) - return builder.build() + result = builder.build() + _log_step_decisions(result.decisions) + return result.pipeline def _sanitize_run_id(dataflow: str) -> str: diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 6e0f77ec1c..326ef63336 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -45,7 +45,7 @@ ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - build_pipeline_callback, build_sdmx_pipeline, build_steps, + StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -333,7 +333,8 @@ def _names_from_builder(self, builder = PipelineBuilder(config=cfg, state=state or self._empty_state(), steps=builder_steps) - pipeline = builder.build() + result = builder.build() + pipeline = result.pipeline return [step.name for step in pipeline.get_steps()] def test_run_only_step(self) -> None: @@ -382,6 +383,18 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: "create-dc-config", ]) + def test_force_branch_records_decisions(self) -> None: + cfg = PipelineConfig(command=_TEST_COMMAND, force=True) + steps = build_steps(cfg) + builder = PipelineBuilder(config=cfg, + state=self._empty_state(), + steps=steps) + result = builder.build() + self.assertEqual(len(result.decisions), len(steps)) + for decision in result.decisions: + self.assertEqual(decision.decision, StepDecision.RUN) + self.assertIn("Force flag set", decision.reason) + def test_run_only_ignores_timestamp_chaining(self) -> None: newer = 4_000 older = 3_000 @@ -412,6 +425,25 @@ def test_version_bump_schedules_downstream(self) -> None: self.assertEqual([s.name for s in pipeline.get_steps()], ["process-full-data", 
"create-dc-config"]) + def test_incremental_records_skip_reasons(self) -> None: + state = self._state_with({ + "download-data": (1, "succeeded", 1_000), + "download-metadata": (1, "succeeded", 1_000), + "create-sample": (1, "succeeded", 1_000), + "create-schema-mapping": (1, "succeeded", 1_000), + "process-full-data": (1, "succeeded", 1_000), + "create-dc-config": (1, "succeeded", 1_000), + }) + cfg = PipelineConfig(command=_TEST_COMMAND) + steps = build_steps(cfg) + builder = PipelineBuilder(config=cfg, state=state, steps=steps) + result = builder.build() + self.assertFalse(result.pipeline.get_steps()) + self.assertEqual(len(result.decisions), len(steps)) + for decision in result.decisions: + self.assertEqual(decision.decision, StepDecision.SKIP) + self.assertIn("up-to-date", decision.reason) + class RunPipelineTest(unittest.TestCase): From 7400c0d88177a817c330f0572623577ec3b1a312 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 25 Nov 2025 10:00:13 +0000 Subject: [PATCH 19/54] feat: implement metadata download step --- tools/agentic_import/sdmx_import_pipeline.py | 54 ++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 8f419dbf3c..4291663260 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -21,6 +21,7 @@ import os import re import shlex +import subprocess import sys from dataclasses import dataclass from datetime import datetime, timezone @@ -33,6 +34,8 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) +SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" + from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, PipelineRunner, RunnerConfig, Step) @@ -42,6 +45,20 @@ FLAGS = flags.FLAGS +def _require_config_field(value: str | None, field: str, + step_name: str) -> str: + if value: + return value + raise ValueError(f"{step_name} requires config.{field}") + + +def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: + command = [sys.executable, str(SDMX_CLI_PATH), *args] + if verbose: + logging.debug(f"Running SDMX CLI command: {' '.join(command)}") + subprocess.run(command, check=True) + + def _define_flags() -> None: flags.DEFINE_string("endpoint", None, "SDMX service endpoint.") flags.mark_flag_as_required("endpoint") @@ -268,12 +285,43 @@ def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) def run(self) -> None: - logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + endpoint = _require_config_field(self._config.endpoint, "endpoint", + self.name) + agency = _require_config_field(self._config.agency, "agency", + self.name) + dataflow = _require_config_field(self._config.dataflow, "dataflow", + self.name) + dataset_prefix = _resolve_dataset_prefix(self._config) + working_dir = _resolve_working_dir(self._config) + output_path = working_dir / f"{dataset_prefix}_metadata.xml" + if self._config.verbose: + logging.info( + f"Starting SDMX metadata download: endpoint={endpoint} " + f"agency={agency} dataflow={dataflow} -> {output_path}") + else: + logging.info(f"Downloading SDMX metadata to {output_path}") + args = [ + "download-metadata", + f"--endpoint={endpoint}", + f"--agency={agency}", + f"--dataflow={dataflow}", + f"--output_path={output_path}", + ] + if self._config.verbose: + 
args.append("--verbose") + _run_sdmx_cli(args, verbose=self._config.verbose) def dry_run(self) -> None: + dataset_prefix = _resolve_dataset_prefix(self._config) + working_dir = Path(self._config.working_dir + or os.getcwd()).resolve() + output_path = working_dir / f"{dataset_prefix}_metadata.xml" + endpoint = self._config.endpoint or "" + agency = self._config.agency or "" + dataflow = self._config.dataflow or "" logging.info( - f"{self.name} (dry run): previewing metadata download inputs") + f"{self.name} (dry run): would fetch endpoint={endpoint} " + f"agency={agency} dataflow={dataflow} -> {output_path}") class CreateSampleStep(SdmxStep): From 3ea7f21aaca0b7cbcaaa8acb215b1b37fc1ec597 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 10:48:25 +0000 Subject: [PATCH 20/54] Refactor SDMX pipeline to use shared subprocess wrapper and cached command preparation --- tools/agentic_import/sdmx_import_pipeline.py | 62 ++++++++------- .../sdmx_import_pipeline_test.py | 75 ++++++++++++++++++- 2 files changed, 111 insertions(+), 26 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 4291663260..f1f6cb90e5 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -23,7 +23,7 @@ import shlex import subprocess import sys -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Callable, ClassVar, Sequence @@ -45,20 +45,30 @@ FLAGS = flags.FLAGS -def _require_config_field(value: str | None, field: str, - step_name: str) -> str: +def _require_config_field(value: str | None, field: str, step_name: str) -> str: if value: return value raise ValueError(f"{step_name} requires config.{field}") -def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: - command = [sys.executable, str(SDMX_CLI_PATH), *args] +@dataclass(frozen=True) +class CommandPlan: + """Holds a constructed command and its expected output path.""" + full_command: list[str] + output_path: Path + + +def _run_command(command: Sequence[str], *, verbose: bool) -> None: if verbose: - logging.debug(f"Running SDMX CLI command: {' '.join(command)}") + logging.debug(f"Running command: {' '.join(command)}") subprocess.run(command, check=True) +def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: + command = [sys.executable, str(SDMX_CLI_PATH), *args] + _run_command(command, verbose=verbose) + + def _define_flags() -> None: flags.DEFINE_string("endpoint", None, "SDMX service endpoint.") flags.mark_flag_as_required("endpoint") @@ -283,23 +293,19 @@ class DownloadMetadataStep(SdmxStep): def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._plan: CommandPlan | None = None - def run(self) -> None: + def _prepare_command(self) -> CommandPlan: + if self._plan: + return self._plan endpoint = _require_config_field(self._config.endpoint, "endpoint", self.name) - agency = _require_config_field(self._config.agency, "agency", - self.name) + agency = _require_config_field(self._config.agency, "agency", self.name) dataflow = _require_config_field(self._config.dataflow, "dataflow", self.name) dataset_prefix = _resolve_dataset_prefix(self._config) working_dir = _resolve_working_dir(self._config) output_path = working_dir / f"{dataset_prefix}_metadata.xml" - if self._config.verbose: - logging.info( - f"Starting SDMX metadata 
download: endpoint={endpoint} " - f"agency={agency} dataflow={dataflow} -> {output_path}") - else: - logging.info(f"Downloading SDMX metadata to {output_path}") args = [ "download-metadata", f"--endpoint={endpoint}", @@ -309,19 +315,25 @@ def run(self) -> None: ] if self._config.verbose: args.append("--verbose") - _run_sdmx_cli(args, verbose=self._config.verbose) + full_command = [sys.executable, str(SDMX_CLI_PATH)] + args + self._plan = CommandPlan(full_command=full_command, + output_path=output_path) + return self._plan + + def run(self) -> None: + plan = self._prepare_command() + if self._config.verbose: + logging.info( + f"Starting SDMX metadata download: {' '.join(plan.full_command)} -> {plan.output_path}" + ) + else: + logging.info(f"Downloading SDMX metadata to {plan.output_path}") + _run_command(plan.full_command, verbose=self._config.verbose) def dry_run(self) -> None: - dataset_prefix = _resolve_dataset_prefix(self._config) - working_dir = Path(self._config.working_dir - or os.getcwd()).resolve() - output_path = working_dir / f"{dataset_prefix}_metadata.xml" - endpoint = self._config.endpoint or "" - agency = self._config.agency or "" - dataflow = self._config.dataflow or "" + plan = self._prepare_command() logging.info( - f"{self.name} (dry run): would fetch endpoint={endpoint} " - f"agency={agency} dataflow={dataflow} -> {output_path}") + f"{self.name} (dry run): would run {' '.join(plan.full_command)}") class CreateSampleStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 326ef63336..88457391de 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -46,7 +46,7 @@ from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, - run_sdmx_pipeline) + run_sdmx_pipeline, DownloadMetadataStep, _run_command) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -463,6 +463,11 @@ def setUp(self) -> None: self._tmpdir_obj = tempfile.TemporaryDirectory() self.addCleanup(self._tmpdir_obj.cleanup) self._tmpdir = self._tmpdir_obj.name + # Mock _run_command to avoid actual execution during pipeline tests + self._run_command_patcher = mock.patch( + "tools.agentic_import.sdmx_import_pipeline._run_command") + self._mock_run_command = self._run_command_patcher.start() + self.addCleanup(self._run_command_patcher.stop) def test_run_pipeline_updates_state_and_hash(self) -> None: command = "sdmx run pipeline" @@ -579,5 +584,73 @@ def test_hash_unchanged_skips_rerun(self) -> None: self.assertEqual(first_state, second_state) +class SdmxStepTest(unittest.TestCase): + + def setUp(self) -> None: + self._tmpdir_obj = tempfile.TemporaryDirectory() + self.addCleanup(self._tmpdir_obj.cleanup) + self._tmpdir = self._tmpdir_obj.name + + def test_run_command_logs_and_executes(self) -> None: + with mock.patch("subprocess.run") as mock_run: + with self.assertLogs(logging.get_absl_logger(), + level="DEBUG") as logs: + _run_command(["echo", "hello"], verbose=True) + + mock_run.assert_called_once_with(["echo", "hello"], check=True) + self.assertTrue( + any("Running command: echo hello" in entry + for entry in logs.output)) + + def test_download_metadata_step_caches_plan(self) -> None: + config = PipelineConfig(command="test", + 
endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True) + step = DownloadMetadataStep(name="test-step", config=config) + + # First call creates plan + plan1 = step._prepare_command() + self.assertIn("download-metadata", plan1.full_command) + self.assertIn("--endpoint=https://example.com", plan1.full_command) + + # Second call returns same object + plan2 = step._prepare_command() + self.assertIs(plan1, plan2) + + def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig(command="test", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True) + step = DownloadMetadataStep(name="test-step", config=config) + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("download-metadata" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("download-metadata", args[0]) + self.assertTrue(kwargs["verbose"]) + + if __name__ == "__main__": unittest.main() From c1936bd16c55a9881a95f89b38261587436b451b Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 11:12:27 +0000 Subject: [PATCH 21/54] Implement DownloadDataStep in sdmx_import_pipeline --- tools/agentic_import/sdmx_import_pipeline.py | 44 ++++++++++++++- .../sdmx_import_pipeline_test.py | 55 ++++++++++++++++++- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index f1f6cb90e5..d259a292d9 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -277,13 +277,51 @@ class DownloadDataStep(SdmxStep): def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._plan: CommandPlan | None = None + + def _prepare_command(self) -> CommandPlan: + if self._plan: + return self._plan + endpoint = _require_config_field(self._config.endpoint, "endpoint", + self.name) + agency = _require_config_field(self._config.agency, "agency", self.name) + dataflow = _require_config_field(self._config.dataflow, "dataflow", + self.name) + dataset_prefix = _resolve_dataset_prefix(self._config) + working_dir = _resolve_working_dir(self._config) + output_path = working_dir / f"{dataset_prefix}_data.csv" + args = [ + "download-data", + f"--endpoint={endpoint}", + f"--agency={agency}", + f"--dataflow={dataflow}", + f"--output_path={output_path}", + ] + if self._config.dataflow_key: + args.append(f"--key={self._config.dataflow_key}") + if self._config.dataflow_param: + args.append(f"--param={self._config.dataflow_param}") + if self._config.verbose: + args.append("--verbose") + full_command = [sys.executable, str(SDMX_CLI_PATH)] + args + self._plan = CommandPlan(full_command=full_command, + output_path=output_path) + return self._plan def run(self) -> None: - logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + plan = self._prepare_command() + if self._config.verbose: + 
logging.info( + f"Starting SDMX data download: {' '.join(plan.full_command)} -> {plan.output_path}" + ) + else: + logging.info(f"Downloading SDMX data to {plan.output_path}") + _run_command(plan.full_command, verbose=self._config.verbose) def dry_run(self) -> None: - logging.info(f"{self.name} (dry run): previewing data download inputs") + plan = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(plan.full_command)}") class DownloadMetadataStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 88457391de..6de7f350e9 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -46,7 +46,7 @@ from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, - run_sdmx_pipeline, DownloadMetadataStep, _run_command) + run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, _run_command) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -651,6 +651,59 @@ def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("download-metadata", args[0]) self.assertTrue(kwargs["verbose"]) + def test_download_data_step_caches_plan(self) -> None: + config = PipelineConfig(command="test", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW", + dataflow_key="test-key", + dataflow_param="area=US", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True) + step = DownloadDataStep(name="test-step", config=config) + + # First call creates plan + plan1 = step._prepare_command() + self.assertIn("download-data", plan1.full_command) + self.assertIn("--endpoint=https://example.com", plan1.full_command) + self.assertIn("--key=test-key", plan1.full_command) + self.assertIn("--param=area=US", plan1.full_command) + + # Second call returns same object + plan2 = step._prepare_command() + self.assertIs(plan1, plan2) + + def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig(command="test", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True) + step = DownloadDataStep(name="test-step", config=config) + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("download-data" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("download-data", args[0]) + self.assertTrue(kwargs["verbose"]) + if __name__ == "__main__": unittest.main() From db4caa426c2a1209792e93499e059b22454bad4a Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 16:34:21 +0000 Subject: [PATCH 22/54] Refactor SDMX config resolution and align flag names - Refactored SDMX configuration flags to hierarchical dot-notation (e.g., sdmx.dataflow.id). - Introduced constants for flag names to reduce duplication. 
- Aligned critical input hash keys with flag names. - Implemented _resolve_config to handle dataset prefix and working dir resolution once. - Updated steps to use pre-resolved configuration. - Fixed tests and lint issues. --- tools/agentic_import/sdmx_import_pipeline.py | 255 ++++++++++++------ .../sdmx_import_pipeline_test.py | 231 ++++++++++++---- 2 files changed, 348 insertions(+), 138 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index d259a292d9..654e2b1278 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -23,6 +23,7 @@ import shlex import subprocess import sys +import dataclasses from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path @@ -35,6 +36,15 @@ sys.path.insert(0, str(REPO_ROOT)) SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" +DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" + +# Flag names +_FLAG_SDMX_ENDPOINT = "sdmx.endpoint" +_FLAG_SDMX_AGENCY = "sdmx.agency" +_FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" +_FLAG_SDMX_DATAFLOW_KEY = "sdmx.dataflow.key" +_FLAG_SDMX_DATAFLOW_PARAM = "sdmx.dataflow.param" +_FLAG_SAMPLE_ROWS = "sample.rows" from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, PipelineAbort, PipelineCallback, @@ -70,22 +80,27 @@ def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: def _define_flags() -> None: - flags.DEFINE_string("endpoint", None, "SDMX service endpoint.") - flags.mark_flag_as_required("endpoint") + flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") + flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) - flags.DEFINE_string("agency", None, "Owning SDMX agency identifier.") - flags.mark_flag_as_required("agency") + flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, + "Owning SDMX agency identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) - flags.DEFINE_string("dataflow", None, "Target SDMX dataflow identifier.") - flags.mark_flag_as_required("dataflow") + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, + "Target SDMX dataflow identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) - flags.DEFINE_string("dataflow_key", None, "Optional SDMX key or filter.") - flags.DEFINE_alias("key", "dataflow_key") + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, + "Optional SDMX key or filter.") flags.DEFINE_string( - "dataflow_param", None, + _FLAG_SDMX_DATAFLOW_PARAM, None, "Optional SDMX parameter appended to the dataflow query.") + flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, + "Number of rows to sample from downloaded data.") + flags.DEFINE_string( "dataset_prefix", None, "Optional dataset prefix to override auto-derived values.") @@ -207,27 +222,47 @@ def build_pipeline_callback( @dataclass(frozen=True) -class PipelineConfig: - """User-configurable inputs that mimic planned CLI flags. +class SdmxDataflowConfig: + """Configuration for SDMX dataflow.""" + id: str | None = None + key: str | None = None + param: str | None = None - This is a lightweight container; CLI parsing will be added in a later - phase. Defaults are intentionally minimal. 
- """ - command: str +@dataclass(frozen=True) +class SdmxConfig: + """Configuration for SDMX data access.""" endpoint: str | None = None agency: str | None = None - dataflow: str | None = None - dataflow_key: str | None = None - dataflow_param: str | None = None + dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) + + +@dataclass(frozen=True) +class SampleConfig: + """Configuration for data sampling.""" + rows: int = 1000 + + +@dataclass(frozen=True) +class RunConfig: + """Configuration for pipeline execution.""" + command: str dataset_prefix: str | None = None - working_dir: str | None = None # TODO: Add CLI flag once semantics stabilize. + working_dir: str | None = None run_only: str | None = None force: bool = False verbose: bool = False skip_confirmation: bool = False +@dataclass(frozen=True) +class PipelineConfig: + """Aggregated configuration for the pipeline.""" + sdmx: SdmxConfig = field(default_factory=SdmxConfig) + sample: SampleConfig = field(default_factory=SampleConfig) + run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) + + @dataclass(frozen=True) class StepDecision: """Represents whether a step will run and why.""" @@ -282,13 +317,14 @@ def __init__(self, *, name: str, config: PipelineConfig) -> None: def _prepare_command(self) -> CommandPlan: if self._plan: return self._plan - endpoint = _require_config_field(self._config.endpoint, "endpoint", - self.name) - agency = _require_config_field(self._config.agency, "agency", self.name) - dataflow = _require_config_field(self._config.dataflow, "dataflow", - self.name) - dataset_prefix = _resolve_dataset_prefix(self._config) - working_dir = _resolve_working_dir(self._config) + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) output_path = working_dir / f"{dataset_prefix}_data.csv" args = [ "download-data", @@ -297,11 +333,11 @@ def _prepare_command(self) -> CommandPlan: f"--dataflow={dataflow}", f"--output_path={output_path}", ] - if self._config.dataflow_key: - args.append(f"--key={self._config.dataflow_key}") - if self._config.dataflow_param: - args.append(f"--param={self._config.dataflow_param}") - if self._config.verbose: + if self._config.sdmx.dataflow.key: + args.append(f"--key={self._config.sdmx.dataflow.key}") + if self._config.sdmx.dataflow.param: + args.append(f"--param={self._config.sdmx.dataflow.param}") + if self._config.run.verbose: args.append("--verbose") full_command = [sys.executable, str(SDMX_CLI_PATH)] + args self._plan = CommandPlan(full_command=full_command, @@ -310,13 +346,13 @@ def _prepare_command(self) -> CommandPlan: def run(self) -> None: plan = self._prepare_command() - if self._config.verbose: + if self._config.run.verbose: logging.info( f"Starting SDMX data download: {' '.join(plan.full_command)} -> {plan.output_path}" ) else: logging.info(f"Downloading SDMX data to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.verbose) + _run_command(plan.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: plan = self._prepare_command() @@ -336,13 +372,14 @@ def __init__(self, *, name: str, config: PipelineConfig) -> None: def _prepare_command(self) -> CommandPlan: if self._plan: 
return self._plan - endpoint = _require_config_field(self._config.endpoint, "endpoint", - self.name) - agency = _require_config_field(self._config.agency, "agency", self.name) - dataflow = _require_config_field(self._config.dataflow, "dataflow", - self.name) - dataset_prefix = _resolve_dataset_prefix(self._config) - working_dir = _resolve_working_dir(self._config) + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) output_path = working_dir / f"{dataset_prefix}_metadata.xml" args = [ "download-metadata", @@ -351,7 +388,7 @@ def _prepare_command(self) -> CommandPlan: f"--dataflow={dataflow}", f"--output_path={output_path}", ] - if self._config.verbose: + if self._config.run.verbose: args.append("--verbose") full_command = [sys.executable, str(SDMX_CLI_PATH)] + args self._plan = CommandPlan(full_command=full_command, @@ -360,13 +397,13 @@ def _prepare_command(self) -> CommandPlan: def run(self) -> None: plan = self._prepare_command() - if self._config.verbose: + if self._config.run.verbose: logging.info( f"Starting SDMX metadata download: {' '.join(plan.full_command)} -> {plan.output_path}" ) else: logging.info(f"Downloading SDMX metadata to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.verbose) + _run_command(plan.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: plan = self._prepare_command() @@ -381,13 +418,52 @@ class CreateSampleStep(SdmxStep): def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._plan: CommandPlan | None = None + + def _prepare_command(self) -> CommandPlan: + if self._plan: + return self._plan + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) + input_path = working_dir / f"{dataset_prefix}_data.csv" + output_path = working_dir / f"{dataset_prefix}_sample.csv" + + # Check input file existence before running, but allow plan creation. + # In a real run, this will fail early if download-data didn't run. 
+ args = [ + f"--sampler_input={input_path}", + f"--sampler_output={output_path}", + f"--sampler_output_rows={self._config.sample.rows}", + ] + full_command = [sys.executable, str(DATA_SAMPLER_PATH)] + args + self._plan = CommandPlan(full_command=full_command, + output_path=output_path) + return self._plan def run(self) -> None: - logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + plan = self._prepare_command() + # Find input path from command args + input_path_arg = next((arg for arg in plan.full_command + if arg.startswith("--sampler_input=")), None) + if not input_path_arg: + raise RuntimeError("Could not find sampler_input in command") + input_path = Path(input_path_arg.split("=")[1]) + + if not input_path.is_file(): + raise RuntimeError(f"Input file missing for sampling: {input_path}") + + if self._config.run.verbose: + logging.info( + f"Starting data sampling: {' '.join(plan.full_command)} -> {plan.output_path}" + ) + else: + logging.info(f"Sampling data to {plan.output_path}") + _run_command(plan.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - logging.info(f"{self.name} (dry run): previewing sample generation") + plan = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(plan.full_command)}") class CreateSchemaMapStep(SdmxStep): @@ -453,9 +529,9 @@ def __init__(self, self._critical_input_hash = critical_input_hash def build(self) -> BuildResult: - if self._config.run_only: - planned, decisions = self._plan_run_only(self._config.run_only) - elif self._config.force: + if self._config.run.run_only: + planned, decisions = self._plan_run_only(self._config.run.run_only) + elif self._config.run.force: logging.info("Force flag set; scheduling all SDMX steps") planned, decisions = self._plan_all_steps( "Force flag set; scheduling this step") @@ -630,12 +706,13 @@ def _sanitize_run_id(dataflow: str) -> str: def _resolve_dataset_prefix(config: PipelineConfig) -> str: - if config.dataset_prefix: - return config.dataset_prefix - if not config.dataflow: + if config.run.dataset_prefix: + return config.run.dataset_prefix + if not config.sdmx.dataflow.id: raise ValueError( - "dataflow or dataset_prefix is required to derive dataset prefix") - sanitized = _sanitize_run_id(config.dataflow) + "dataflow.id or dataset_prefix is required to derive dataset prefix" + ) + sanitized = _sanitize_run_id(config.sdmx.dataflow.id) if not sanitized: raise ValueError("dataflow value is invalid after sanitization") return sanitized @@ -643,18 +720,18 @@ def _resolve_dataset_prefix(config: PipelineConfig) -> str: def _compute_critical_input_hash(config: PipelineConfig) -> str: payload = { - "agency": config.agency, - "dataflow": config.dataflow, - "endpoint": config.endpoint, - "dataflow_key": config.dataflow_key, - "dataflow_param": config.dataflow_param, + _FLAG_SDMX_AGENCY: config.sdmx.agency, + _FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, + _FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, + _FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, + _FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, } serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) return hashlib.sha256(serialized.encode("utf-8")).hexdigest() def _resolve_working_dir(config: PipelineConfig) -> Path: - directory = Path(config.working_dir or os.getcwd()) + directory = Path(config.run.working_dir or os.getcwd()) if directory.exists(): if not directory.is_dir(): raise ValueError(f"working_dir is not a directory: {directory}") @@ -663,14 
+740,25 @@ def _resolve_working_dir(config: PipelineConfig) -> Path: return directory +def _resolve_config(config: PipelineConfig) -> PipelineConfig: + """Resolves dynamic configuration values and returns a new config.""" + dataset_prefix = _resolve_dataset_prefix(config) + working_dir = _resolve_working_dir(config) + new_run = dataclasses.replace(config.run, + dataset_prefix=dataset_prefix, + working_dir=str(working_dir)) + return dataclasses.replace(config, run=new_run) + + def run_sdmx_pipeline( *, config: PipelineConfig, now_fn: Callable[[], datetime] | None = None, ) -> None: """Orchestrates the SDMX pipeline for the provided configuration.""" - working_dir = _resolve_working_dir(config) - dataset_prefix = _resolve_dataset_prefix(config) + resolved_config = _resolve_config(config) + working_dir = Path(resolved_config.run.working_dir) + dataset_prefix = resolved_config.run.dataset_prefix state_handler = StateHandler( state_path=working_dir / ".datacommons" / f"{dataset_prefix}.state.json", @@ -679,19 +767,19 @@ def run_sdmx_pipeline( state = state_handler.get_state() # Snapshot state for planning so callback mutations do not affect scheduling. state_snapshot = copy.deepcopy(state) - critical_hash = _compute_critical_input_hash(config) - pipeline = build_sdmx_pipeline(config=config, + critical_hash = _compute_critical_input_hash(resolved_config) + pipeline = build_sdmx_pipeline(config=resolved_config, state=state_snapshot, critical_input_hash=critical_hash) callback = build_pipeline_callback( state_handler=state_handler, dataset_prefix=dataset_prefix, critical_input_hash=critical_hash, - command=config.command, - skip_confirmation=config.skip_confirmation, + command=resolved_config.run.command, + skip_confirmation=resolved_config.run.skip_confirmation, now_fn=now_fn, ) - if config.verbose: + if resolved_config.run.verbose: logging.set_verbosity(logging.DEBUG) runner = PipelineRunner(RunnerConfig()) runner.run(pipeline, callback) @@ -699,20 +787,29 @@ def run_sdmx_pipeline( def prepare_config() -> PipelineConfig: """Builds PipelineConfig from CLI flags.""" + # absl.flags doesn't support dots in attribute access easily, + # so we access the flag values directly from the flag names. 
command = shlex.join(sys.argv) if sys.argv else "python" return PipelineConfig( - command=command, - endpoint=FLAGS.endpoint, - agency=FLAGS.agency, - dataflow=FLAGS.dataflow, - dataflow_key=FLAGS.dataflow_key, - dataflow_param=FLAGS.dataflow_param, - dataset_prefix=FLAGS.dataset_prefix, - working_dir=None, - run_only=FLAGS.run_only, - force=FLAGS.force, - verbose=FLAGS.verbose, - skip_confirmation=FLAGS.skip_confirmation, + sdmx=SdmxConfig( + endpoint=FLAGS[_FLAG_SDMX_ENDPOINT].value, + agency=FLAGS[_FLAG_SDMX_AGENCY].value, + dataflow=SdmxDataflowConfig( + id=FLAGS[_FLAG_SDMX_DATAFLOW_ID].value, + key=FLAGS[_FLAG_SDMX_DATAFLOW_KEY].value, + param=FLAGS[_FLAG_SDMX_DATAFLOW_PARAM].value, + ), + ), + sample=SampleConfig(rows=FLAGS[_FLAG_SAMPLE_ROWS].value,), + run=RunConfig( + command=command, + dataset_prefix=FLAGS.dataset_prefix, + working_dir=None, + run_only=FLAGS.run_only, + force=FLAGS.force, + verbose=FLAGS.verbose, + skip_confirmation=FLAGS.skip_confirmation, + ), ) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 6de7f350e9..feae900e6d 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -46,7 +46,8 @@ from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, - run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, _run_command) + run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, + _run_command, SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -338,20 +339,23 @@ def _names_from_builder(self, return [step.name for step in pipeline.get_steps()] def test_run_only_step(self) -> None: - cfg_step = PipelineConfig(command=_TEST_COMMAND, - run_only="download-data") + cfg_step = PipelineConfig( + run=RunConfig(command=_TEST_COMMAND, run_only="download-data")) names_step = self._names_from_builder(cfg_step) self.assertEqual(names_step, ["download-data"]) with self.assertRaisesRegex(ValueError, "run_only step not found"): self._names_from_builder( - PipelineConfig(command=_TEST_COMMAND, run_only="nope")) + PipelineConfig( + run=RunConfig(command=_TEST_COMMAND, run_only="nope"))) with self.assertRaisesRegex(ValueError, "run_only step not found"): self._names_from_builder( - PipelineConfig(command=_TEST_COMMAND, run_only="download.nope")) + PipelineConfig(run=RunConfig(command=_TEST_COMMAND, + run_only="download.nope"))) def test_force_semantics(self) -> None: - cfg_all = PipelineConfig(command=_TEST_COMMAND, force=True) + cfg_all = PipelineConfig( + run=RunConfig(command=_TEST_COMMAND, force=True)) names_all = self._names_from_builder(cfg_all) self.assertEqual(names_all, [ "download-data", @@ -373,7 +377,7 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: "process-full-data": (1, "succeeded", older), "create-dc-config": (1, "succeeded", older), }) - cfg = PipelineConfig(command=_TEST_COMMAND) + cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) names = self._names_from_builder(cfg, state=state) self.assertEqual(names, [ "download-metadata", @@ -384,7 +388,7 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: ]) def test_force_branch_records_decisions(self) -> None: - cfg = 
PipelineConfig(command=_TEST_COMMAND, force=True) + cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND, force=True)) steps = build_steps(cfg) builder = PipelineBuilder(config=cfg, state=self._empty_state(), @@ -402,7 +406,8 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: "download-data": (1, "succeeded", newer), "download-metadata": (1, "succeeded", older), }) - cfg = PipelineConfig(command=_TEST_COMMAND, run_only="download-data") + cfg = PipelineConfig( + run=RunConfig(command=_TEST_COMMAND, run_only="download-data")) names = self._names_from_builder(cfg, state=state) self.assertEqual(names, ["download-data"]) @@ -417,7 +422,7 @@ def test_version_bump_schedules_downstream(self) -> None: "process-full-data": (1, "succeeded", 1000), "create-dc-config": (1, "succeeded", 1000), }) - cfg = PipelineConfig(command=_TEST_COMMAND) + cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) names = self._names_from_builder(cfg, steps, state) self.assertEqual(names, ["process-full-data", "create-dc-config"]) @@ -434,7 +439,7 @@ def test_incremental_records_skip_reasons(self) -> None: "process-full-data": (1, "succeeded", 1_000), "create-dc-config": (1, "succeeded", 1_000), }) - cfg = PipelineConfig(command=_TEST_COMMAND) + cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) steps = build_steps(cfg) builder = PipelineBuilder(config=cfg, state=state, steps=steps) result = builder.build() @@ -449,15 +454,23 @@ class RunPipelineTest(unittest.TestCase): def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, command: str) -> PipelineConfig: - return PipelineConfig(endpoint="https://api.example.com", - agency="TEST_AGENCY", - dataflow=dataflow, - dataflow_key="test-key", - dataflow_param="area=US", - dataset_prefix=dataset_prefix, - working_dir=self._tmpdir, - skip_confirmation=True, - command=command) + return PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://api.example.com", + agency="TEST_AGENCY", + dataflow=SdmxDataflowConfig( + id=dataflow, + key="test-key", + param="area=US", + ), + ), + run=RunConfig( + dataset_prefix=dataset_prefix, + working_dir=self._tmpdir, + skip_confirmation=True, + command=command, + ), + ) def setUp(self) -> None: self._tmpdir_obj = tempfile.TemporaryDirectory() @@ -477,6 +490,9 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: clock = _IncrementingClock(datetime(2025, 1, 2, tzinfo=timezone.utc), timedelta(seconds=2)) + # Create dummy input file for sampling + (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") + run_sdmx_pipeline(config=config, now_fn=clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -487,11 +503,11 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: expected_hash = hashlib.sha256( json.dumps( { - "agency": config.agency, - "dataflow": config.dataflow, - "endpoint": config.endpoint, - "dataflow_key": config.dataflow_key, - "dataflow_param": config.dataflow_param, + "sdmx.agency": config.sdmx.agency, + "sdmx.dataflow.id": config.sdmx.dataflow.id, + "sdmx.endpoint": config.sdmx.endpoint, + "sdmx.dataflow.key": config.sdmx.dataflow.key, + "sdmx.dataflow.param": config.sdmx.dataflow.param, }, sort_keys=True, separators=(",", ":")).encode("utf-8")).hexdigest() @@ -513,6 +529,9 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: config = self._build_config(dataset_prefix=None, dataflow=dataflow, command="sdmx run sanitized") + # Create dummy input file for sampling (sanitized name) + (Path(self._tmpdir) / + 
"my_flow_name_2025_data.csv").write_text("header\nrow1") run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( datetime(2025, 1, 3, tzinfo=timezone.utc), @@ -529,9 +548,12 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: def test_invalid_working_dir_raises(self) -> None: path = Path(self._tmpdir) / "not_a_dir" path.write_text("content") - config = dataclasses.replace(self._build_config( - dataset_prefix="demo", dataflow="df", command="sdmx run invalid"), - working_dir=str(path)) + base_config = self._build_config(dataset_prefix="demo", + dataflow="df", + command="sdmx run invalid") + updated_run = dataclasses.replace(base_config.run, + working_dir=str(path)) + config = dataclasses.replace(base_config, run=updated_run) with self.assertRaisesRegex(ValueError, "working_dir is not a directory"): run_sdmx_pipeline(config=config) @@ -542,13 +564,19 @@ def test_hash_change_forces_full_rerun(self) -> None: command="sdmx rerun force") first_clock = _IncrementingClock( datetime(2025, 1, 4, tzinfo=timezone.utc), timedelta(seconds=1)) + # Create dummy input file for sampling + (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") run_sdmx_pipeline(config=config, now_fn=first_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" with state_path.open(encoding="utf-8") as fp: first_state = json.load(fp) - updated_config = dataclasses.replace(config, dataflow_key="changed-key") + updated_dataflow = dataclasses.replace(config.sdmx.dataflow, + key="changed-key") + updated_sdmx = dataclasses.replace(config.sdmx, + dataflow=updated_dataflow) + updated_config = dataclasses.replace(config, sdmx=updated_sdmx) second_clock = _IncrementingClock( datetime(2025, 1, 5, tzinfo=timezone.utc), timedelta(seconds=1)) run_sdmx_pipeline(config=updated_config, now_fn=second_clock) @@ -568,6 +596,8 @@ def test_hash_unchanged_skips_rerun(self) -> None: command="sdmx rerun noop") initial_clock = _IncrementingClock( datetime(2025, 1, 6, tzinfo=timezone.utc), timedelta(seconds=1)) + # Create dummy input file for sampling + (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") run_sdmx_pipeline(config=config, now_fn=initial_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -603,13 +633,19 @@ def test_run_command_logs_and_executes(self) -> None: for entry in logs.output)) def test_download_metadata_step_caches_plan(self) -> None: - config = PipelineConfig(command="test", - endpoint="https://example.com", - agency="AGENCY", - dataflow="FLOW", - dataset_prefix="demo", - working_dir=self._tmpdir, - verbose=True) + config = PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://example.com", + agency="AGENCY", + dataflow=SdmxDataflowConfig(id="FLOW"), + ), + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + ) step = DownloadMetadataStep(name="test-step", config=config) # First call creates plan @@ -622,13 +658,19 @@ def test_download_metadata_step_caches_plan(self) -> None: self.assertIs(plan1, plan2) def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: - config = PipelineConfig(command="test", - endpoint="https://example.com", - agency="AGENCY", - dataflow="FLOW", - dataset_prefix="demo", - working_dir=self._tmpdir, - verbose=True) + config = PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://example.com", + agency="AGENCY", + dataflow=SdmxDataflowConfig(id="FLOW"), + ), + run=RunConfig( + command="test", + dataset_prefix="demo", + 
working_dir=self._tmpdir, + verbose=True, + ), + ) step = DownloadMetadataStep(name="test-step", config=config) with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" @@ -652,15 +694,23 @@ def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: self.assertTrue(kwargs["verbose"]) def test_download_data_step_caches_plan(self) -> None: - config = PipelineConfig(command="test", - endpoint="https://example.com", - agency="AGENCY", - dataflow="FLOW", - dataflow_key="test-key", - dataflow_param="area=US", - dataset_prefix="demo", - working_dir=self._tmpdir, - verbose=True) + config = PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://example.com", + agency="AGENCY", + dataflow=SdmxDataflowConfig( + id="FLOW", + key="test-key", + param="area=US", + ), + ), + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + ) step = DownloadDataStep(name="test-step", config=config) # First call creates plan @@ -675,13 +725,19 @@ def test_download_data_step_caches_plan(self) -> None: self.assertIs(plan1, plan2) def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: - config = PipelineConfig(command="test", - endpoint="https://example.com", - agency="AGENCY", - dataflow="FLOW", - dataset_prefix="demo", - working_dir=self._tmpdir, - verbose=True) + config = PipelineConfig( + sdmx=SdmxConfig( + endpoint="https://example.com", + agency="AGENCY", + dataflow=SdmxDataflowConfig(id="FLOW"), + ), + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + ) step = DownloadDataStep(name="test-step", config=config) with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" @@ -704,6 +760,63 @@ def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("download-data", args[0]) self.assertTrue(kwargs["verbose"]) + def test_create_sample_step_caches_plan(self) -> None: + config = PipelineConfig( + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + sample=SampleConfig(rows=500), + ) + step = CreateSampleStep(name="test-step", config=config) + + # First call creates plan + plan1 = step._prepare_command() + self.assertIn("data_sampler.py", plan1.full_command[1]) + self.assertIn("--sampler_output_rows=500", plan1.full_command) + + # Second call returns same object + plan2 = step._prepare_command() + self.assertIs(plan1, plan2) + + def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig( + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + sample=SampleConfig(rows=500), + ) + step = CreateSampleStep(name="test-step", config=config) + + # Create dummy input file + input_path = Path(self._tmpdir) / "demo_data.csv" + input_path.write_text("header\nrow1") + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("data_sampler.py" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("data_sampler.py", args[0][1]) + 
self.assertTrue(kwargs["verbose"]) + if __name__ == "__main__": unittest.main() From fc7c2e4e4100b57a5bf1540dc5ebae9056a70407 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 17:33:04 +0000 Subject: [PATCH 23/54] feat: Add early input file existence check to `CreateSampleStep` and refactor command preparation using `_StepContext` dataclass, with corresponding test updates. --- tools/agentic_import/sdmx_import_pipeline.py | 49 +++++++++---------- .../sdmx_import_pipeline_test.py | 21 ++++++++ 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 654e2b1278..2ffe77df8b 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -416,54 +416,53 @@ class CreateSampleStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + input_path: Path + full_command: list[str] + output_path: Path + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) - self._plan: CommandPlan | None = None + self._context: CreateSampleStep._StepContext | None = None - def _prepare_command(self) -> CommandPlan: - if self._plan: - return self._plan + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context dataset_prefix = self._config.run.dataset_prefix working_dir = Path(self._config.run.working_dir) input_path = working_dir / f"{dataset_prefix}_data.csv" output_path = working_dir / f"{dataset_prefix}_sample.csv" - # Check input file existence before running, but allow plan creation. - # In a real run, this will fail early if download-data didn't run. + if not input_path.is_file(): + raise RuntimeError(f"Input file missing for sampling: {input_path}") + args = [ f"--sampler_input={input_path}", f"--sampler_output={output_path}", f"--sampler_output_rows={self._config.sample.rows}", ] full_command = [sys.executable, str(DATA_SAMPLER_PATH)] + args - self._plan = CommandPlan(full_command=full_command, - output_path=output_path) - return self._plan + self._context = CreateSampleStep._StepContext(input_path=input_path, + full_command=full_command, + output_path=output_path) + return self._context def run(self) -> None: - plan = self._prepare_command() - # Find input path from command args - input_path_arg = next((arg for arg in plan.full_command - if arg.startswith("--sampler_input=")), None) - if not input_path_arg: - raise RuntimeError("Could not find sampler_input in command") - input_path = Path(input_path_arg.split("=")[1]) - - if not input_path.is_file(): - raise RuntimeError(f"Input file missing for sampling: {input_path}") - + context = self._prepare_command() if self._config.run.verbose: logging.info( - f"Starting data sampling: {' '.join(plan.full_command)} -> {plan.output_path}" + f"Starting data sampling: {' '.join(context.full_command)} -> {context.output_path}" ) else: - logging.info(f"Sampling data to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.run.verbose) + logging.info(f"Sampling data to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() logging.info( - f"{self.name} (dry run): would run {' '.join(plan.full_command)}") + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class CreateSchemaMapStep(SdmxStep): diff --git 
a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index feae900e6d..a1be47e6c2 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -772,6 +772,10 @@ def test_create_sample_step_caches_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) + # Create dummy input file to satisfy validation + input_path = Path(self._tmpdir) / "demo_data.csv" + input_path.write_text("header\nrow1") + # First call creates plan plan1 = step._prepare_command() self.assertIn("data_sampler.py", plan1.full_command[1]) @@ -817,6 +821,23 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("data_sampler.py", args[0][1]) self.assertTrue(kwargs["verbose"]) + def test_create_sample_step_dry_run_fails_if_input_missing(self) -> None: + config = PipelineConfig( + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + sample=SampleConfig(rows=500), + ) + step = CreateSampleStep(name="test-step", config=config) + # No input file created + + with self.assertRaisesRegex(RuntimeError, + "Input file missing for sampling"): + step.dry_run() + if __name__ == "__main__": unittest.main() From e7c8db537de8e00c950e822da649d8f1a6debc53 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 17:37:45 +0000 Subject: [PATCH 24/54] refactor: Replace `CommandPlan` dataclass with a nested `_StepContext` and update all related references in pipeline steps and tests. --- tools/agentic_import/sdmx_import_pipeline.py | 71 ++++++++++--------- .../sdmx_import_pipeline_test.py | 40 +++++------ 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 2ffe77df8b..8abac73aaa 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -61,13 +61,6 @@ def _require_config_field(value: str | None, field: str, step_name: str) -> str: raise ValueError(f"{step_name} requires config.{field}") -@dataclass(frozen=True) -class CommandPlan: - """Holds a constructed command and its expected output path.""" - full_command: list[str] - output_path: Path - - def _run_command(command: Sequence[str], *, verbose: bool) -> None: if verbose: logging.debug(f"Running command: {' '.join(command)}") @@ -310,13 +303,18 @@ class DownloadDataStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + full_command: list[str] + output_path: Path + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) - self._plan: CommandPlan | None = None + self._context: DownloadDataStep._StepContext | None = None - def _prepare_command(self) -> CommandPlan: - if self._plan: - return self._plan + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context endpoint = _require_config_field(self._config.sdmx.endpoint, _FLAG_SDMX_ENDPOINT, self.name) agency = _require_config_field(self._config.sdmx.agency, @@ -340,24 +338,25 @@ def _prepare_command(self) -> CommandPlan: if self._config.run.verbose: args.append("--verbose") full_command = [sys.executable, str(SDMX_CLI_PATH)] + args - self._plan = CommandPlan(full_command=full_command, - output_path=output_path) - return self._plan + self._context = DownloadDataStep._StepContext(full_command=full_command, + output_path=output_path) + 
return self._context def run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() if self._config.run.verbose: logging.info( - f"Starting SDMX data download: {' '.join(plan.full_command)} -> {plan.output_path}" + f"Starting SDMX data download: {' '.join(context.full_command)} -> {context.output_path}" ) else: - logging.info(f"Downloading SDMX data to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.run.verbose) + logging.info(f"Downloading SDMX data to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() logging.info( - f"{self.name} (dry run): would run {' '.join(plan.full_command)}") + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class DownloadMetadataStep(SdmxStep): @@ -365,13 +364,18 @@ class DownloadMetadataStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + full_command: list[str] + output_path: Path + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) - self._plan: CommandPlan | None = None + self._context: DownloadMetadataStep._StepContext | None = None - def _prepare_command(self) -> CommandPlan: - if self._plan: - return self._plan + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context endpoint = _require_config_field(self._config.sdmx.endpoint, _FLAG_SDMX_ENDPOINT, self.name) agency = _require_config_field(self._config.sdmx.agency, @@ -391,24 +395,25 @@ def _prepare_command(self) -> CommandPlan: if self._config.run.verbose: args.append("--verbose") full_command = [sys.executable, str(SDMX_CLI_PATH)] + args - self._plan = CommandPlan(full_command=full_command, - output_path=output_path) - return self._plan + self._context = DownloadMetadataStep._StepContext( + full_command=full_command, output_path=output_path) + return self._context def run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() if self._config.run.verbose: logging.info( - f"Starting SDMX metadata download: {' '.join(plan.full_command)} -> {plan.output_path}" + f"Starting SDMX metadata download: {' '.join(context.full_command)} -> {context.output_path}" ) else: - logging.info(f"Downloading SDMX metadata to {plan.output_path}") - _run_command(plan.full_command, verbose=self._config.run.verbose) + logging.info(f"Downloading SDMX metadata to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - plan = self._prepare_command() + context = self._prepare_command() logging.info( - f"{self.name} (dry run): would run {' '.join(plan.full_command)}") + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class CreateSampleStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index a1be47e6c2..a8bf7fd265 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -648,14 +648,14 @@ def test_download_metadata_step_caches_plan(self) -> None: ) step = DownloadMetadataStep(name="test-step", config=config) - # First call creates plan - plan1 = step._prepare_command() - self.assertIn("download-metadata", plan1.full_command) - self.assertIn("--endpoint=https://example.com", plan1.full_command) + # First call creates 
context + context1 = step._prepare_command() + self.assertIn("download-metadata", context1.full_command) + self.assertIn("--endpoint=https://example.com", context1.full_command) # Second call returns same object - plan2 = step._prepare_command() - self.assertIs(plan1, plan2) + context2 = step._prepare_command() + self.assertIs(context1, context2) def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -713,16 +713,16 @@ def test_download_data_step_caches_plan(self) -> None: ) step = DownloadDataStep(name="test-step", config=config) - # First call creates plan - plan1 = step._prepare_command() - self.assertIn("download-data", plan1.full_command) - self.assertIn("--endpoint=https://example.com", plan1.full_command) - self.assertIn("--key=test-key", plan1.full_command) - self.assertIn("--param=area=US", plan1.full_command) + # First call creates context + context1 = step._prepare_command() + self.assertIn("download-data", context1.full_command) + self.assertIn("--endpoint=https://example.com", context1.full_command) + self.assertIn("--key=test-key", context1.full_command) + self.assertIn("--param=area=US", context1.full_command) # Second call returns same object - plan2 = step._prepare_command() - self.assertIs(plan1, plan2) + context2 = step._prepare_command() + self.assertIs(context1, context2) def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -776,14 +776,14 @@ def test_create_sample_step_caches_plan(self) -> None: input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") - # First call creates plan - plan1 = step._prepare_command() - self.assertIn("data_sampler.py", plan1.full_command[1]) - self.assertIn("--sampler_output_rows=500", plan1.full_command) + # First call creates context + context1 = step._prepare_command() + self.assertIn("data_sampler.py", context1.full_command[1]) + self.assertIn("--sampler_output_rows=500", context1.full_command) # Second call returns same object - plan2 = step._prepare_command() - self.assertIs(plan1, plan2) + context2 = step._prepare_command() + self.assertIs(context1, context2) def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( From 9a7e38d720d119b3808fb442ed289c39f0638492 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Tue, 25 Nov 2025 17:49:11 +0000 Subject: [PATCH 25/54] refactor: move `dry_run` from the general `Step` interface to `SdmxStep` and update related tests and callback logic. 
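
For context, a minimal sketch of the resulting split, using simplified signatures (the real classes also carry name/version metadata and a config object):

    import abc


    class Step(abc.ABC):
        """Generic pipeline step: run() is the only shared contract now."""

        @abc.abstractmethod
        def run(self) -> None:
            ...


    class SdmxStep(Step, abc.ABC):
        """SDMX steps additionally expose a read-only preview."""

        @abc.abstractmethod
        def dry_run(self) -> None:
            ...


    def before_step(step: Step) -> None:
        # Only SDMX steps know how to preview themselves, so callbacks that
        # want a dry run must guard the call with an isinstance check.
        if isinstance(step, SdmxStep):
            step.dry_run()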
--- tools/agentic_import/pipeline.py | 4 ---- tools/agentic_import/pipeline_test.py | 6 ------ tools/agentic_import/sdmx_import_pipeline.py | 10 +++++++--- .../sdmx_import_pipeline_test.py | 19 ++++++++++++------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index bf4890b9de..09b22a94dd 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -44,10 +44,6 @@ def version(self) -> int: def run(self) -> None: """Execute the step.""" - @abc.abstractmethod - def dry_run(self) -> None: - """Log a read-only preview of the work to be done.""" - class BaseStep(Step, abc.ABC): """Helper base class that stores mandatory metadata.""" diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index ee19777ef3..52944546f1 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -39,9 +39,6 @@ def run(self) -> None: self.executed = True self._events.append(f"run:{self.name}") - def dry_run(self) -> None: - return None - class _FailingStep(BaseStep): @@ -51,9 +48,6 @@ def __init__(self, *, name: str, version: int) -> None: def run(self) -> None: raise ValueError("boom") - def dry_run(self) -> None: - return None - class PipelineRunnerTest(unittest.TestCase): diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 8abac73aaa..5fd8e02f79 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,6 +15,7 @@ from __future__ import annotations +import abc import copy import hashlib import json @@ -119,8 +120,9 @@ class InteractiveCallback(PipelineCallback): """Prompts the user before each step runs.""" def before_step(self, step: Step) -> None: - logging.info(f"Dry run for {step.name} (v{step.version}):") - step.dry_run() + if isinstance(step, SdmxStep): + logging.info(f"Dry run for {step.name} (v{step.version}):") + step.dry_run() prompt = f"Run step {step.name} (v{step.version})? [Y/n] " response = input(prompt).strip().lower() if response in ("n", "no"): @@ -295,7 +297,9 @@ def name(self) -> str: def version(self) -> int: return self._version - # Subclasses must implement run() and dry_run(). 
+ @abc.abstractmethod + def dry_run(self) -> None: + """Log a read-only preview of the work to be done.""" class DownloadDataStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index a8bf7fd265..6431dd0224 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -47,10 +47,13 @@ InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - _run_command, SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig) + _run_command, SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig, + SdmxStep) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) +_DUMMY_CONFIG = PipelineConfig(run=RunConfig(command="test")) + class _IncrementingClock: @@ -67,10 +70,10 @@ def __call__(self) -> datetime: return self._value -class _RecordingStep(BaseStep): +class _RecordingStep(SdmxStep): def __init__(self, name: str, *, should_fail: bool = False) -> None: - super().__init__(name=name, version=1) + super().__init__(name=name, version=1, config=_DUMMY_CONFIG) self._should_fail = should_fail def run(self) -> None: @@ -81,10 +84,10 @@ def dry_run(self) -> None: logging.info("noop") -class _VersionedStep(BaseStep): +class _VersionedStep(SdmxStep): def __init__(self, name: str, version: int) -> None: - super().__init__(name=name, version=version) + super().__init__(name=name, version=version, config=_DUMMY_CONFIG) def run(self) -> None: logging.info("noop") @@ -196,10 +199,12 @@ def test_abort_skips_state_persistence(self) -> None: json.dump(previous, fp) callback, handler = self._build_callback(tmpdir=tmpdir, clock=clock) - class _AbortStep(BaseStep): + class _AbortStep(SdmxStep): def __init__(self) -> None: - super().__init__(name="download.download-data", version=1) + super().__init__(name="download.download-data", + version=1, + config=_DUMMY_CONFIG) def run(self) -> None: raise PipelineAbort("user requested stop") From 8fed16a33245a5b886338ea0b24ed31b61213d99 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 06:59:03 +0000 Subject: [PATCH 26/54] feat: Enable CreateSchemaMapStep to execute pvmap_generator.py and add gemini_cli flag. 
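
As an illustration of the resulting behaviour, a usage sketch that mirrors the new unit tests; the working directory, dataset prefix, and file contents below are placeholder values:

    import tempfile
    from pathlib import Path

    from tools.agentic_import.sdmx_import_pipeline import (CreateSchemaMapStep,
                                                            PipelineConfig,
                                                            RunConfig)

    # The step refuses to build its command unless the sample and metadata
    # files produced by the earlier steps already exist.
    work = Path(tempfile.mkdtemp())
    (work / "demo_sample.csv").write_text("header\nrow1")
    (work / "demo_metadata.xml").write_text("<metadata/>")

    config = PipelineConfig(run=RunConfig(command="demo",
                                          dataset_prefix="demo",
                                          working_dir=str(work),
                                          gemini_cli="gemini"))
    step = CreateSchemaMapStep(name="create-schema-map", config=config)
    step.dry_run()  # Logs the pvmap_generator.py invocation without running it.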
--- tools/agentic_import/sdmx_import_pipeline.py | 56 +++++++++++- .../sdmx_import_pipeline_test.py | 90 ++++++++++++++++++- 2 files changed, 142 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 5fd8e02f79..f7c7bf5850 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -38,6 +38,7 @@ SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" +PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" # Flag names _FLAG_SDMX_ENDPOINT = "sdmx.endpoint" @@ -109,6 +110,9 @@ def _define_flags() -> None: flags.DEFINE_boolean("skip_confirmation", False, "Skip interactive confirmation prompts.") + flags.DEFINE_string("gemini_cli", "gemini", + "Path to Gemini CLI executable.") + def _format_time(value: datetime) -> str: if value.tzinfo is None: @@ -248,6 +252,7 @@ class RunConfig: force: bool = False verbose: bool = False skip_confirmation: bool = False + gemini_cli: str | None = None @dataclass(frozen=True) @@ -479,16 +484,62 @@ class CreateSchemaMapStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + sample_path: Path + metadata_path: Path + output_prefix: Path + full_command: list[str] + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateSchemaMapStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) + sample_path = working_dir / f"{dataset_prefix}_sample.csv" + metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" + output_prefix = working_dir / dataset_prefix + + if not sample_path.is_file(): + raise RuntimeError(f"Sample file missing: {sample_path}") + if not metadata_path.is_file(): + raise RuntimeError(f"Metadata file missing: {metadata_path}") + + args = [ + f"--input_data={sample_path}", + f"--input_metadata={metadata_path}", + "--sdmx_dataset", + f"--output_path={output_prefix}", + ] + if self._config.run.skip_confirmation: + args.append("--skip_confirmation") + if self._config.run.gemini_cli: + args.append(f"--gemini_cli={self._config.run.gemini_cli}") + + full_command = [sys.executable, str(PVMAP_GENERATOR_PATH)] + args + self._context = CreateSchemaMapStep._StepContext( + sample_path=sample_path, + metadata_path=metadata_path, + output_prefix=output_prefix, + full_command=full_command) + return self._context def run(self) -> None: + context = self._prepare_command() logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" + ) + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: + context = self._prepare_command() logging.info( - f"{self.name} (dry run): previewing schema mapping outputs") + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class ProcessFullDataStep(SdmxStep): @@ -817,6 +868,7 @@ def prepare_config() -> PipelineConfig: force=FLAGS.force, verbose=FLAGS.verbose, skip_confirmation=FLAGS.skip_confirmation, + gemini_cli=FLAGS.gemini_cli, ), ) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py 
b/tools/agentic_import/sdmx_import_pipeline_test.py index 6431dd0224..37314e7eac 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -47,8 +47,8 @@ InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - _run_command, SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig, - SdmxStep) + CreateSchemaMapStep, _run_command, SdmxConfig, SampleConfig, RunConfig, + SdmxDataflowConfig, SdmxStep) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -497,6 +497,9 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: # Create dummy input file for sampling (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") + # Create dummy sample and metadata files for schema mapping + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") run_sdmx_pipeline(config=config, now_fn=clock) @@ -537,6 +540,11 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: # Create dummy input file for sampling (sanitized name) (Path(self._tmpdir) / "my_flow_name_2025_data.csv").write_text("header\nrow1") + # Create dummy sample and metadata files for schema mapping + (Path(self._tmpdir) / + "my_flow_name_2025_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / + "my_flow_name_2025_metadata.xml").write_text("") run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( datetime(2025, 1, 3, tzinfo=timezone.utc), @@ -571,6 +579,9 @@ def test_hash_change_forces_full_rerun(self) -> None: datetime(2025, 1, 4, tzinfo=timezone.utc), timedelta(seconds=1)) # Create dummy input file for sampling (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") + # Create dummy sample and metadata files for schema mapping + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") run_sdmx_pipeline(config=config, now_fn=first_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -603,6 +614,9 @@ def test_hash_unchanged_skips_rerun(self) -> None: datetime(2025, 1, 6, tzinfo=timezone.utc), timedelta(seconds=1)) # Create dummy input file for sampling (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") + # Create dummy sample and metadata files for schema mapping + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") run_sdmx_pipeline(config=config, now_fn=initial_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -843,6 +857,78 @@ def test_create_sample_step_dry_run_fails_if_input_missing(self) -> None: "Input file missing for sampling"): step.dry_run() + def test_create_schema_map_step_caches_plan(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + gemini_cli="custom-gemini", + skip_confirmation=True, + ),) + step = CreateSchemaMapStep(name="test-step", config=config) + + # Create dummy input files to satisfy validation + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + + # First call creates context + context1 = step._prepare_command() 
+ self.assertIn("pvmap_generator.py", context1.full_command[1]) + self.assertIn("--gemini_cli=custom-gemini", context1.full_command) + self.assertIn("--skip_confirmation", context1.full_command) + + # Second call returns same object + context2 = step._prepare_command() + self.assertIs(context1, context2) + + def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = CreateSchemaMapStep(name="test-step", config=config) + + # Create dummy input files + (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") + (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("pvmap_generator.py" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("pvmap_generator.py", args[0][1]) + self.assertTrue(kwargs["verbose"]) + + def test_create_schema_map_step_dry_run_fails_if_input_missing( + self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = CreateSchemaMapStep(name="test-step", config=config) + # No input files created + + with self.assertRaisesRegex(RuntimeError, "Sample file missing"): + step.dry_run() + if __name__ == "__main__": unittest.main() From 26800adae8455f08cc0404b477000f8d851ff93a Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:04:45 +0000 Subject: [PATCH 27/54] feat: implement `ProcessFullDataStep` using `stat_var_processor` and introduce distinct sample and final output directories. 
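
As a usage sketch (mirroring the _create_test_input_files helper added to the tests; all paths and file contents are placeholders):

    import tempfile
    from pathlib import Path

    from tools.agentic_import.sdmx_import_pipeline import (PipelineConfig,
                                                            ProcessFullDataStep,
                                                            RunConfig)

    # The step needs the full data file plus the pvmap/metadata emitted by the
    # schema-map step under sample_output/; its own outputs go under output/.
    work = Path(tempfile.mkdtemp())
    (work / "demo_data.csv").write_text("data")
    sample_out = work / "sample_output"
    sample_out.mkdir()
    (sample_out / "demo_pvmap.csv").write_text("pvmap")
    (sample_out / "demo_metadata.csv").write_text("metadata")

    config = PipelineConfig(run=RunConfig(command="demo",
                                          dataset_prefix="demo",
                                          working_dir=str(work)))
    step = ProcessFullDataStep(name="process-full-data", config=config)
    step.dry_run()  # Logs the stat_var_processor.py command; run() writes output/demo*.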
--- tools/agentic_import/sdmx_import_pipeline.py | 70 ++++++++- .../sdmx_import_pipeline_test.py | 140 ++++++++++++++---- 2 files changed, 175 insertions(+), 35 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index f7c7bf5850..972ca419e8 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -38,8 +38,13 @@ SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" +STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / + "stat_var_processor.py") PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" +SAMPLE_OUTPUT_DIR = Path("sample_output") +FINAL_OUTPUT_DIR = Path("output") + # Flag names _FLAG_SDMX_ENDPOINT = "sdmx.endpoint" _FLAG_SDMX_AGENCY = "sdmx.agency" @@ -502,7 +507,7 @@ def _prepare_command(self) -> _StepContext: working_dir = Path(self._config.run.working_dir) sample_path = working_dir / f"{dataset_prefix}_sample.csv" metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" - output_prefix = working_dir / dataset_prefix + output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix if not sample_path.is_file(): raise RuntimeError(f"Sample file missing: {sample_path}") @@ -530,6 +535,7 @@ def _prepare_command(self) -> _StepContext: def run(self) -> None: context = self._prepare_command() + context.output_prefix.parent.mkdir(parents=True, exist_ok=True) logging.info( f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" ) @@ -547,15 +553,73 @@ class ProcessFullDataStep(SdmxStep): VERSION = 1 + RUN_OUTPUT_COLUMNS: ClassVar[str] = ( + "observationDate,observationAbout,variableMeasured,value," + "observationPeriod,measurementMethod,unit,scalingFactor") + + @dataclass(frozen=True) + class _StepContext: + input_data_path: Path + pv_map_path: Path + metadata_path: Path + full_command: list[str] + output_prefix: Path + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._context: ProcessFullDataStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) + input_data_path = working_dir / f"{dataset_prefix}_data.csv" + pv_map_path = (working_dir / SAMPLE_OUTPUT_DIR / + f"{dataset_prefix}_pvmap.csv") + metadata_path = (working_dir / SAMPLE_OUTPUT_DIR / + f"{dataset_prefix}_metadata.csv") + output_prefix = working_dir / FINAL_OUTPUT_DIR / dataset_prefix + + for required in (input_data_path, pv_map_path, metadata_path): + if not required.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {required}") + + args = [ + f"--input_data={input_data_path}", + f"--pv_map={pv_map_path}", + f"--config_file={metadata_path}", + "--generate_statvar_name=True", + "--skip_constant_csv_columns=False", + f"--output_columns={self.RUN_OUTPUT_COLUMNS}", + f"--output_path={output_prefix}", + ] + full_command = [sys.executable, str(STAT_VAR_PROCESSOR_PATH)] + args + self._context = ProcessFullDataStep._StepContext( + input_data_path=input_data_path, + pv_map_path=pv_map_path, + metadata_path=metadata_path, + full_command=full_command, + output_prefix=output_prefix, + ) + return self._context def run(self) -> None: + context = 
self._prepare_command() + # Ensure output directory exists + context.output_prefix.parent.mkdir(parents=True, exist_ok=True) logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + f"Starting stat_var_processor: input={context.input_data_path} " + f"pvmap={context.pv_map_path} metadata={context.metadata_path} -> " + f"{context.output_prefix}") + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - logging.info(f"{self.name} (dry run): previewing full-data processing") + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class CreateDcConfigStep(SdmxStep): diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 37314e7eac..5a449c4c78 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -47,8 +47,8 @@ InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - CreateSchemaMapStep, _run_command, SdmxConfig, SampleConfig, RunConfig, - SdmxDataflowConfig, SdmxStep) + CreateSchemaMapStep, ProcessFullDataStep, _run_command, SdmxConfig, + SampleConfig, RunConfig, SdmxDataflowConfig, SdmxStep) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -487,19 +487,26 @@ def setUp(self) -> None: self._mock_run_command = self._run_command_patcher.start() self.addCleanup(self._run_command_patcher.stop) + def _create_test_input_files(self, prefix: str) -> None: + (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") + (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") + (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") + + sample_output_dir = Path(self._tmpdir) / "sample_output" + sample_output_dir.mkdir(parents=True, exist_ok=True) + (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") + (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + def test_run_pipeline_updates_state_and_hash(self) -> None: command = "sdmx run pipeline" config = self._build_config(dataset_prefix="demo", dataflow="df.1", command=command) - clock = _IncrementingClock(datetime(2025, 1, 2, tzinfo=timezone.utc), - timedelta(seconds=2)) + clock = _IncrementingClock(datetime(2025, 1, 1, tzinfo=timezone.utc), + timedelta(seconds=1)) - # Create dummy input file for sampling - (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") - # Create dummy sample and metadata files for schema mapping - (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + # Create dummy files for ProcessFullDataStep + self._create_test_input_files("demo") run_sdmx_pipeline(config=config, now_fn=clock) @@ -537,17 +544,14 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: config = self._build_config(dataset_prefix=None, dataflow=dataflow, command="sdmx run sanitized") - # Create dummy input file for sampling (sanitized name) - (Path(self._tmpdir) / - "my_flow_name_2025_data.csv").write_text("header\nrow1") - # Create dummy sample and metadata files for schema mapping - (Path(self._tmpdir) / - "my_flow_name_2025_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / - 
"my_flow_name_2025_metadata.xml").write_text("") + + # Create test files for ProcessFullDataStep with sanitized name + sanitized_prefix = "my_flow_name_2025" + self._create_test_input_files(sanitized_prefix) + run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( - datetime(2025, 1, 3, tzinfo=timezone.utc), + datetime(2025, 1, 2, tzinfo=timezone.utc), timedelta(seconds=2))) expected_run_id = "my_flow_name_2025" @@ -576,12 +580,12 @@ def test_hash_change_forces_full_rerun(self) -> None: dataflow="df.2", command="sdmx rerun force") first_clock = _IncrementingClock( - datetime(2025, 1, 4, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy input file for sampling - (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") - # Create dummy sample and metadata files for schema mapping - (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + datetime(2025, 1, 2, tzinfo=timezone.utc), timedelta(seconds=1)) + + # Create dummy files for ProcessFullDataStep + self._create_test_input_files("demo") + + # Run 1 with original config run_sdmx_pipeline(config=config, now_fn=first_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -594,7 +598,7 @@ def test_hash_change_forces_full_rerun(self) -> None: dataflow=updated_dataflow) updated_config = dataclasses.replace(config, sdmx=updated_sdmx) second_clock = _IncrementingClock( - datetime(2025, 1, 5, tzinfo=timezone.utc), timedelta(seconds=1)) + datetime(2025, 1, 3, tzinfo=timezone.utc), timedelta(seconds=1)) run_sdmx_pipeline(config=updated_config, now_fn=second_clock) with state_path.open(encoding="utf-8") as fp: @@ -611,12 +615,12 @@ def test_hash_unchanged_skips_rerun(self) -> None: dataflow="df.3", command="sdmx rerun noop") initial_clock = _IncrementingClock( - datetime(2025, 1, 6, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy input file for sampling - (Path(self._tmpdir) / "demo_data.csv").write_text("header\nrow1") - # Create dummy sample and metadata files for schema mapping - (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + datetime(2025, 1, 3, tzinfo=timezone.utc), timedelta(seconds=1)) + + # Create dummy files for ProcessFullDataStep + self._create_test_input_files("demo") + + # Run 1 run_sdmx_pipeline(config=config, now_fn=initial_clock) state_path = Path(self._tmpdir) / ".datacommons" / "demo.state.json" @@ -640,6 +644,16 @@ def setUp(self) -> None: self.addCleanup(self._tmpdir_obj.cleanup) self._tmpdir = self._tmpdir_obj.name + def _create_test_input_files(self, prefix: str) -> None: + (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") + (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") + (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") + + sample_output_dir = Path(self._tmpdir) / "sample_output" + sample_output_dir.mkdir(parents=True, exist_ok=True) + (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") + (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -925,8 +939,70 @@ def test_create_schema_map_step_dry_run_fails_if_input_missing( ),) step = CreateSchemaMapStep(name="test-step", config=config) # No input files created + with self.assertRaises(RuntimeError): + step.dry_run() + + def 
test_process_full_data_step_caches_plan(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = ProcessFullDataStep(name="test-step", config=config) - with self.assertRaisesRegex(RuntimeError, "Sample file missing"): + # Create test files to satisfy validation + self._create_test_input_files("demo") + + context1 = step._prepare_command() + context2 = step._prepare_command() + self.assertIs(context1, context2) + + def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = ProcessFullDataStep(name="test-step", config=config) + + # Create test files + self._create_test_input_files("demo") + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + # Verify dry_run logged the command + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue( + any("stat_var_processor.py" in entry for entry in logs.output)) + + # Verify run called the command with the same args + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + self.assertIn("stat_var_processor.py", args[0][1]) + self.assertIn("--input_data=", args[0][2]) + self.assertTrue(kwargs["verbose"]) + + def test_process_full_data_step_run_fails_if_input_missing(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = ProcessFullDataStep(name="test-step", config=config) + # Missing input files + with self.assertRaises(RuntimeError): + step.run() + with self.assertRaises(RuntimeError): step.dry_run() From 55360a9eb7bc1514c8fddfbfaef54c259e8fa1b6 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:21:36 +0000 Subject: [PATCH 28/54] feat: Implement `CreateDcConfigStep` to generate custom DC configurations. 
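
The step plans a single generate_custom_dc_config.py invocation from the resolved
pipeline configuration; dry_run() only logs the planned command, while run() checks
that the processed CSV exists and then shells out via _run_command(). A minimal usage
sketch mirroring the new unit tests (the step name "create-dc-config", the working
directory "/tmp/work", and the SDMX identifiers are illustrative values, not defaults):

    from tools.agentic_import.sdmx_import_pipeline import (
        CreateDcConfigStep, PipelineConfig, RunConfig, SdmxConfig,
        SdmxDataflowConfig)

    config = PipelineConfig(
        sdmx=SdmxConfig(endpoint="https://example.com",
                        agency="AGENCY",
                        dataflow=SdmxDataflowConfig(id="FLOW")),
        run=RunConfig(command="test",
                      dataset_prefix="demo",
                      working_dir="/tmp/work"))
    step = CreateDcConfigStep(name="create-dc-config", config=config)
    step.dry_run()  # logs the generate_custom_dc_config.py command it would run
    step.run()      # requires /tmp/work/output/demo.csv; executes via _run_command()
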
--- tools/agentic_import/sdmx_import_pipeline.py | 57 +++++++++++- .../sdmx_import_pipeline_test.py | 87 ++++++++++++++++++- 2 files changed, 140 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 972ca419e8..0b4b18f64a 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -41,6 +41,8 @@ STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / "stat_var_processor.py") PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" +DC_CONFIG_GENERATOR_PATH = (REPO_ROOT / "tools" / "agentic_import" / + "generate_custom_dc_config.py") SAMPLE_OUTPUT_DIR = Path("sample_output") FINAL_OUTPUT_DIR = Path("output") @@ -627,15 +629,66 @@ class CreateDcConfigStep(SdmxStep): VERSION = 1 + @dataclass(frozen=True) + class _StepContext: + input_csv: Path + output_config: Path + full_command: list[str] + def __init__(self, *, name: str, config: PipelineConfig) -> None: super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateDcConfigStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir) + input_csv = working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}.csv" + output_config = (working_dir / FINAL_OUTPUT_DIR / + f"{dataset_prefix}_config.json") + + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + + dataset_url = (f"{endpoint.rstrip('/')}/data/" + f"{agency},{dataflow},") + + args = [ + f"--input_csv={input_csv}", + f"--output_config={output_config}", + f"--provenance_name={dataflow}", + f"--source_name={agency}", + f"--data_source_url={endpoint}", + f"--dataset_url={dataset_url}", + ] + full_command = [sys.executable, str(DC_CONFIG_GENERATOR_PATH)] + args + self._context = CreateDcConfigStep._StepContext( + input_csv=input_csv, + output_config=output_config, + full_command=full_command) + return self._context def run(self) -> None: + context = self._prepare_command() + if not context.input_csv.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {context.input_csv}") + logging.info( - f"{self.name}: no-op implementation for VERSION={self.VERSION}") + f"Starting custom DC config generation: input={context.input_csv} -> {context.output_config}" + ) + _run_command(context.full_command, verbose=self._config.run.verbose) def dry_run(self) -> None: - logging.info(f"{self.name} (dry run): previewing DC config creation") + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) class PipelineBuilder: diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 5a449c4c78..d3f14d0ced 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -47,8 +47,8 @@ InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - 
CreateSchemaMapStep, ProcessFullDataStep, _run_command, SdmxConfig, - SampleConfig, RunConfig, SdmxDataflowConfig, SdmxStep) + CreateSchemaMapStep, ProcessFullDataStep, CreateDcConfigStep, _run_command, + SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig, SdmxStep) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -507,6 +507,10 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: # Create dummy files for ProcessFullDataStep self._create_test_input_files("demo") + # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "demo.csv").write_text("data") run_sdmx_pipeline(config=config, now_fn=clock) @@ -548,6 +552,10 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: # Create test files for ProcessFullDataStep with sanitized name sanitized_prefix = "my_flow_name_2025" self._create_test_input_files(sanitized_prefix) + # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / f"{sanitized_prefix}.csv").write_text("data") run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( @@ -584,6 +592,10 @@ def test_hash_change_forces_full_rerun(self) -> None: # Create dummy files for ProcessFullDataStep self._create_test_input_files("demo") + # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "demo.csv").write_text("data") # Run 1 with original config run_sdmx_pipeline(config=config, now_fn=first_clock) @@ -619,6 +631,10 @@ def test_hash_unchanged_skips_rerun(self) -> None: # Create dummy files for ProcessFullDataStep self._create_test_input_files("demo") + # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "demo.csv").write_text("data") # Run 1 run_sdmx_pipeline(config=config, now_fn=initial_clock) @@ -654,6 +670,29 @@ def _create_test_input_files(self, prefix: str) -> None: (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + def _build_config(self, + dataset_prefix: str | None, + endpoint: str = "https://example.com", + agency: str = "AGENCY", + dataflow: str = "FLOW") -> PipelineConfig: + return PipelineConfig(sdmx=SdmxConfig( + endpoint=endpoint, + agency=agency, + dataflow=SdmxDataflowConfig(id=dataflow)), + run=RunConfig(command="test", + dataset_prefix=dataset_prefix, + working_dir=self._tmpdir)) + + def _create_test_input_files(self, prefix: str) -> None: + (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") + (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") + (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") + + sample_output_dir = Path(self._tmpdir) / "sample_output" + sample_output_dir.mkdir(parents=True, exist_ok=True) + (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") + (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -1005,6 +1044,50 @@ def 
test_process_full_data_step_run_fails_if_input_missing(self) -> None: with self.assertRaises(RuntimeError): step.dry_run() + def test_create_dc_config_step_caches_plan(self) -> None: + config = self._build_config(dataset_prefix="demo", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW") + step = CreateDcConfigStep(name="test-step", config=config) + context1 = step._prepare_command() + context2 = step._prepare_command() + self.assertIs(context1, context2) + + def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: + config = self._build_config(dataset_prefix="demo", + endpoint="https://example.com", + agency="AGENCY", + dataflow="FLOW") + step = CreateDcConfigStep(name="test-step", config=config) + + # Create test files + self._create_test_input_files("demo") + # Create final output dir and input csv + final_output_dir = Path(self._tmpdir) / "output" + final_output_dir.mkdir(parents=True, exist_ok=True) + (final_output_dir / "demo.csv").write_text("data") + + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + step.run() + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + command = args[0] + self.assertIn("generate_custom_dc_config.py", command[1]) + self.assertIn(f"--input_csv={final_output_dir}/demo.csv", command) + self.assertIn( + f"--output_config={final_output_dir}/demo_config.json", command) + self.assertIn("--provenance_name=FLOW", command) + self.assertIn("--source_name=AGENCY", command) + self.assertIn("--data_source_url=https://example.com", command) + self.assertIn("--dataset_url=https://example.com/data/AGENCY,FLOW,", + command) + + with self.assertLogs(logging.get_absl_logger(), level="INFO") as cm: + step.dry_run() + self.assertTrue(any("would run" in msg for msg in cm.output)) + if __name__ == "__main__": unittest.main() From 1926be1dc6961565d49a8711afe3117104a9a0c4 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:39:22 +0000 Subject: [PATCH 29/54] refactor: Rename dummy config to `_TEST_CONFIG` and centralize dummy output file creation in tests. 
--- .../sdmx_import_pipeline_test.py | 56 +++++++------------ 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index d3f14d0ced..2c497197d6 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -52,7 +52,7 @@ from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) -_DUMMY_CONFIG = PipelineConfig(run=RunConfig(command="test")) +_TEST_CONFIG = PipelineConfig(run=RunConfig(command="test")) class _IncrementingClock: @@ -73,7 +73,7 @@ def __call__(self) -> datetime: class _RecordingStep(SdmxStep): def __init__(self, name: str, *, should_fail: bool = False) -> None: - super().__init__(name=name, version=1, config=_DUMMY_CONFIG) + super().__init__(name=name, version=1, config=_TEST_CONFIG) self._should_fail = should_fail def run(self) -> None: @@ -87,7 +87,7 @@ def dry_run(self) -> None: class _VersionedStep(SdmxStep): def __init__(self, name: str, version: int) -> None: - super().__init__(name=name, version=version, config=_DUMMY_CONFIG) + super().__init__(name=name, version=version, config=_TEST_CONFIG) def run(self) -> None: logging.info("noop") @@ -204,7 +204,7 @@ class _AbortStep(SdmxStep): def __init__(self) -> None: super().__init__(name="download.download-data", version=1, - config=_DUMMY_CONFIG) + config=_TEST_CONFIG) def run(self) -> None: raise PipelineAbort("user requested stop") @@ -497,6 +497,10 @@ def _create_test_input_files(self, prefix: str) -> None: (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / f"{prefix}.csv").write_text("output") + def test_run_pipeline_updates_state_and_hash(self) -> None: command = "sdmx run pipeline" config = self._build_config(dataset_prefix="demo", @@ -505,12 +509,8 @@ def test_run_pipeline_updates_state_and_hash(self) -> None: clock = _IncrementingClock(datetime(2025, 1, 1, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy files for ProcessFullDataStep + # Create test files for ProcessFullDataStep self._create_test_input_files("demo") - # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / "demo.csv").write_text("data") run_sdmx_pipeline(config=config, now_fn=clock) @@ -552,10 +552,6 @@ def test_run_id_sanitizes_dataflow_when_prefix_missing(self) -> None: # Create test files for ProcessFullDataStep with sanitized name sanitized_prefix = "my_flow_name_2025" self._create_test_input_files(sanitized_prefix) - # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / f"{sanitized_prefix}.csv").write_text("data") run_sdmx_pipeline(config=config, now_fn=_IncrementingClock( @@ -590,12 +586,8 @@ def test_hash_change_forces_full_rerun(self) -> None: first_clock = _IncrementingClock( datetime(2025, 1, 2, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy files for ProcessFullDataStep + # Create test files for ProcessFullDataStep self._create_test_input_files("demo") - # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep - 
output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / "demo.csv").write_text("data") # Run 1 with original config run_sdmx_pipeline(config=config, now_fn=first_clock) @@ -629,12 +621,8 @@ def test_hash_unchanged_skips_rerun(self) -> None: initial_clock = _IncrementingClock( datetime(2025, 1, 3, tzinfo=timezone.utc), timedelta(seconds=1)) - # Create dummy files for ProcessFullDataStep + # Create test files for ProcessFullDataStep self._create_test_input_files("demo") - # Create dummy output for ProcessFullDataStep to satisfy CreateDcConfigStep - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / "demo.csv").write_text("data") # Run 1 run_sdmx_pipeline(config=config, now_fn=initial_clock) @@ -670,6 +658,10 @@ def _create_test_input_files(self, prefix: str) -> None: (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / f"{prefix}.csv").write_text("output") + def _build_config(self, dataset_prefix: str | None, endpoint: str = "https://example.com", @@ -683,16 +675,6 @@ def _build_config(self, dataset_prefix=dataset_prefix, working_dir=self._tmpdir)) - def _create_test_input_files(self, prefix: str) -> None: - (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") - (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") - (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") - - sample_output_dir = Path(self._tmpdir) / "sample_output" - sample_output_dir.mkdir(parents=True, exist_ok=True) - (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") - (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") - def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -844,7 +826,7 @@ def test_create_sample_step_caches_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) - # Create dummy input file to satisfy validation + # Create test input file to satisfy validation input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") @@ -869,7 +851,7 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) - # Create dummy input file + # Create test input file input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") @@ -921,7 +903,7 @@ def test_create_schema_map_step_caches_plan(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) - # Create dummy input files to satisfy validation + # Create test input files to satisfy validation (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") (Path(self._tmpdir) / "demo_metadata.xml").write_text("") @@ -944,7 +926,7 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) - # Create dummy input files + # Create test input files (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") (Path(self._tmpdir) / "demo_metadata.xml").write_text("") From 503ea7d662a22779eee01b77ac86ad99926e9ad1 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:45:39 +0000 Subject: [PATCH 30/54] refactor: replace global flags with 
structured dataclass-based configuration objects and related utility functions. --- tools/agentic_import/sdmx_import_pipeline.py | 304 +++++++++---------- 1 file changed, 152 insertions(+), 152 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 0b4b18f64a..a0ee8c2e36 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -64,6 +64,69 @@ FLAGS = flags.FLAGS +@dataclass(frozen=True) +class SdmxDataflowConfig: + """Configuration for SDMX dataflow.""" + id: str | None = None + key: str | None = None + param: str | None = None + + +@dataclass(frozen=True) +class SdmxConfig: + """Configuration for SDMX data access.""" + endpoint: str | None = None + agency: str | None = None + dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) + + +@dataclass(frozen=True) +class SampleConfig: + """Configuration for data sampling.""" + rows: int = 1000 + + +@dataclass(frozen=True) +class RunConfig: + """Configuration for pipeline execution.""" + command: str + dataset_prefix: str | None = None + working_dir: str | None = None + run_only: str | None = None + force: bool = False + verbose: bool = False + skip_confirmation: bool = False + gemini_cli: str | None = None + + +@dataclass(frozen=True) +class PipelineConfig: + """Aggregated configuration for the pipeline.""" + sdmx: SdmxConfig = field(default_factory=SdmxConfig) + sample: SampleConfig = field(default_factory=SampleConfig) + run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) + + +@dataclass(frozen=True) +class StepDecision: + """Represents whether a step will run and why.""" + + RUN: ClassVar[str] = "RUN" + SKIP: ClassVar[str] = "SKIP" + + step_name: str + decision: str + reason: str + + +@dataclass(frozen=True) +class BuildResult: + """Output of planning that includes the pipeline and per-step decisions.""" + + pipeline: Pipeline + decisions: list[StepDecision] + + def _require_config_field(value: str | None, field: str, step_name: str) -> str: if value: return value @@ -81,50 +144,62 @@ def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: _run_command(command, verbose=verbose) -def _define_flags() -> None: - flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") - flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) - - flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, - "Owning SDMX agency identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) - - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, - "Target SDMX dataflow identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) - - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, - "Optional SDMX key or filter.") +def _format_time(value: datetime) -> str: + if value.tzinfo is None: + value = value.replace(tzinfo=timezone.utc) + return value.isoformat() - flags.DEFINE_string( - _FLAG_SDMX_DATAFLOW_PARAM, None, - "Optional SDMX parameter appended to the dataflow query.") - flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, - "Number of rows to sample from downloaded data.") +def _sanitize_run_id(dataflow: str) -> str: + normalized = dataflow.lower() + normalized = re.sub(r"[^a-z0-9_]+", "_", normalized) + normalized = re.sub(r"_+", "_", normalized) + return normalized.strip("_") - flags.DEFINE_string( - "dataset_prefix", None, - "Optional dataset prefix to override auto-derived values.") - flags.DEFINE_string("run_only", None, - "Execute only a specific pipeline step by name.") +def 
_resolve_dataset_prefix(config: PipelineConfig) -> str: + if config.run.dataset_prefix: + return config.run.dataset_prefix + if not config.sdmx.dataflow.id: + raise ValueError( + "dataflow.id or dataset_prefix is required to derive dataset prefix" + ) + sanitized = _sanitize_run_id(config.sdmx.dataflow.id) + if not sanitized: + raise ValueError("dataflow value is invalid after sanitization") + return sanitized - flags.DEFINE_boolean("force", False, "Force all steps to run.") - flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") +def _compute_critical_input_hash(config: PipelineConfig) -> str: + payload = { + _FLAG_SDMX_AGENCY: config.sdmx.agency, + _FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, + _FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, + _FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, + _FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, + } + serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(serialized.encode("utf-8")).hexdigest() - flags.DEFINE_boolean("skip_confirmation", False, - "Skip interactive confirmation prompts.") - flags.DEFINE_string("gemini_cli", "gemini", - "Path to Gemini CLI executable.") +def _resolve_working_dir(config: PipelineConfig) -> Path: + directory = Path(config.run.working_dir or os.getcwd()) + if directory.exists(): + if not directory.is_dir(): + raise ValueError(f"working_dir is not a directory: {directory}") + else: + directory.mkdir(parents=True, exist_ok=True) + return directory -def _format_time(value: datetime) -> str: - if value.tzinfo is None: - value = value.replace(tzinfo=timezone.utc) - return value.isoformat() +def _resolve_config(config: PipelineConfig) -> PipelineConfig: + """Resolves dynamic configuration values and returns a new config.""" + dataset_prefix = _resolve_dataset_prefix(config) + working_dir = _resolve_working_dir(config) + new_run = dataclasses.replace(config.run, + dataset_prefix=dataset_prefix, + working_dir=str(working_dir)) + return dataclasses.replace(config, run=new_run) class InteractiveCallback(PipelineCallback): @@ -227,69 +302,6 @@ def build_pipeline_callback( return CompositeCallback([interactive, json_callback]) -@dataclass(frozen=True) -class SdmxDataflowConfig: - """Configuration for SDMX dataflow.""" - id: str | None = None - key: str | None = None - param: str | None = None - - -@dataclass(frozen=True) -class SdmxConfig: - """Configuration for SDMX data access.""" - endpoint: str | None = None - agency: str | None = None - dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) - - -@dataclass(frozen=True) -class SampleConfig: - """Configuration for data sampling.""" - rows: int = 1000 - - -@dataclass(frozen=True) -class RunConfig: - """Configuration for pipeline execution.""" - command: str - dataset_prefix: str | None = None - working_dir: str | None = None - run_only: str | None = None - force: bool = False - verbose: bool = False - skip_confirmation: bool = False - gemini_cli: str | None = None - - -@dataclass(frozen=True) -class PipelineConfig: - """Aggregated configuration for the pipeline.""" - sdmx: SdmxConfig = field(default_factory=SdmxConfig) - sample: SampleConfig = field(default_factory=SampleConfig) - run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) - - -@dataclass(frozen=True) -class StepDecision: - """Represents whether a step will run and why.""" - - RUN: ClassVar[str] = "RUN" - SKIP: ClassVar[str] = "SKIP" - - step_name: str - decision: str - reason: str - - -@dataclass(frozen=True) 
-class BuildResult: - """Output of planning that includes the pipeline and per-step decisions.""" - - pipeline: Pipeline - decisions: list[StepDecision] - - class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -866,66 +878,14 @@ def build_sdmx_pipeline(*, critical_input_hash: str | None = None) -> Pipeline: builder_steps = steps if steps is not None else build_steps(config) builder = PipelineBuilder(config=config, - state=state, - steps=builder_steps, - critical_input_hash=critical_input_hash) + state=state, + steps=builder_steps, + critical_input_hash=critical_input_hash) result = builder.build() _log_step_decisions(result.decisions) return result.pipeline -def _sanitize_run_id(dataflow: str) -> str: - normalized = dataflow.lower() - normalized = re.sub(r"[^a-z0-9_]+", "_", normalized) - normalized = re.sub(r"_+", "_", normalized) - return normalized.strip("_") - - -def _resolve_dataset_prefix(config: PipelineConfig) -> str: - if config.run.dataset_prefix: - return config.run.dataset_prefix - if not config.sdmx.dataflow.id: - raise ValueError( - "dataflow.id or dataset_prefix is required to derive dataset prefix" - ) - sanitized = _sanitize_run_id(config.sdmx.dataflow.id) - if not sanitized: - raise ValueError("dataflow value is invalid after sanitization") - return sanitized - - -def _compute_critical_input_hash(config: PipelineConfig) -> str: - payload = { - _FLAG_SDMX_AGENCY: config.sdmx.agency, - _FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, - _FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, - _FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, - _FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, - } - serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) - return hashlib.sha256(serialized.encode("utf-8")).hexdigest() - - -def _resolve_working_dir(config: PipelineConfig) -> Path: - directory = Path(config.run.working_dir or os.getcwd()) - if directory.exists(): - if not directory.is_dir(): - raise ValueError(f"working_dir is not a directory: {directory}") - else: - directory.mkdir(parents=True, exist_ok=True) - return directory - - -def _resolve_config(config: PipelineConfig) -> PipelineConfig: - """Resolves dynamic configuration values and returns a new config.""" - dataset_prefix = _resolve_dataset_prefix(config) - working_dir = _resolve_working_dir(config) - new_run = dataclasses.replace(config.run, - dataset_prefix=dataset_prefix, - working_dir=str(working_dir)) - return dataclasses.replace(config, run=new_run) - - def run_sdmx_pipeline( *, config: PipelineConfig, @@ -990,6 +950,46 @@ def prepare_config() -> PipelineConfig: ) +def _define_flags() -> None: + flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") + flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) + + flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, + "Owning SDMX agency identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) + + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, + "Target SDMX dataflow identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) + + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, + "Optional SDMX key or filter.") + + flags.DEFINE_string( + _FLAG_SDMX_DATAFLOW_PARAM, None, + "Optional SDMX parameter appended to the dataflow query.") + + flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, + "Number of rows to sample from downloaded data.") + + flags.DEFINE_string( + "dataset_prefix", None, + "Optional dataset prefix to override auto-derived values.") + + flags.DEFINE_string("run_only", None, + 
"Execute only a specific pipeline step by name.") + + flags.DEFINE_boolean("force", False, "Force all steps to run.") + + flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") + + flags.DEFINE_boolean("skip_confirmation", False, + "Skip interactive confirmation prompts.") + + flags.DEFINE_string("gemini_cli", "gemini", + "Path to Gemini CLI executable.") + + def main(_: list[str]) -> int: config = prepare_config() logging.info(f"SDMX pipeline configuration: {config}") From 58f5d517219ca6d00c803188f1665697ac68fddb Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 08:48:15 +0000 Subject: [PATCH 31/54] feat: Reorder imports and adjust `PipelineBuilder` parameter indentation --- tools/agentic_import/sdmx_import_pipeline.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index a0ee8c2e36..2510a755e1 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -36,6 +36,12 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) +from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, + PipelineAbort, PipelineCallback, + PipelineRunner, RunnerConfig, Step) +from tools.agentic_import.state_handler import (PipelineState, StateHandler, + StepState) + SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / @@ -55,12 +61,6 @@ _FLAG_SDMX_DATAFLOW_PARAM = "sdmx.dataflow.param" _FLAG_SAMPLE_ROWS = "sample.rows" -from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, - PipelineAbort, PipelineCallback, - PipelineRunner, RunnerConfig, Step) -from tools.agentic_import.state_handler import (PipelineState, StateHandler, - StepState) - FLAGS = flags.FLAGS @@ -878,9 +878,9 @@ def build_sdmx_pipeline(*, critical_input_hash: str | None = None) -> Pipeline: builder_steps = steps if steps is not None else build_steps(config) builder = PipelineBuilder(config=config, - state=state, - steps=builder_steps, - critical_input_hash=critical_input_hash) + state=state, + steps=builder_steps, + critical_input_hash=critical_input_hash) result = builder.build() _log_step_decisions(result.decisions) return result.pipeline From 6cc6172e3a84e58fedcfed7a23f33f612639e0e0 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 10:25:47 +0000 Subject: [PATCH 32/54] feat: Add SDMX agentic import pipeline with new documentation, code, and tests. --- tools/agentic_import/sdmx_import_pipeline.md | 89 ++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 tools/agentic_import/sdmx_import_pipeline.md diff --git a/tools/agentic_import/sdmx_import_pipeline.md b/tools/agentic_import/sdmx_import_pipeline.md new file mode 100644 index 0000000000..050f2e60aa --- /dev/null +++ b/tools/agentic_import/sdmx_import_pipeline.md @@ -0,0 +1,89 @@ +# SDMX Agentic Import Pipeline + +The SDMX Agentic Import Pipeline is a Python-based system designed to automate the retrieval and processing of SDMX (Statistical Data and Metadata eXchange) data for Data Commons. It provides a structured, step-based approach to downloading, sampling, mapping, and processing SDMX data into Data Commons artifacts. + +## Overview + +The pipeline orchestrates several tools to handle the end-to-end import process: +1. 
**Download**: Retrieves data and metadata from SDMX endpoints. +2. **Sample**: Creates a manageable sample of the data for analysis. +3. **Map**: Generates Property-Value (PV) mappings using LLM-based tools. +4. **Process**: Converts the full dataset into Data Commons MCF and CSV formats. +5. **Config**: Generates configuration for custom Data Commons instances. + +## Prerequisites + +Before running the pipeline, ensure you have: +1. **Python Environment**: Set up as described in the [main README](./README.md#step-2-environment-setup). +2. **Gemini CLI**: Installed and configured for schema mapping. +3. **Data Commons API Key**: Set in your environment. + +## Usage + +The pipeline is executed using the `sdmx_import_pipeline.py` script. + +### Basic Command + +```bash +python tools/agentic_import/sdmx_import_pipeline.py \ + --sdmx.endpoint="https://sdmx.example.org/data" \ + --sdmx.agency="AGENCY_ID" \ + --sdmx.dataflow.id="DATAFLOW_ID" \ + --working_dir="/path/to/working/dir" +``` + +### Key Flags + +- `--sdmx.endpoint`: The SDMX API endpoint URL. +- `--sdmx.agency`: The SDMX agency ID. +- `--sdmx.dataflow.id`: The SDMX dataflow ID. +- `--sdmx.dataflow.key`: (Optional) Filter key for data download. +- `--sdmx.dataflow.param`: (Optional) Additional parameters for data download. +- `--working_dir`: Directory for input and output files. +- `--sample.rows`: Number of rows for the sample dataset (default: 1000). +- `--force`: Force re-execution of all steps, ignoring saved state. +- `--verbose`: Enable verbose logging. + +## Pipeline Steps + +The pipeline consists of the following steps, executed in order: + +1. **DownloadDataStep**: Downloads SDMX data to `_data.csv`. +2. **DownloadMetadataStep**: Downloads SDMX metadata to `_metadata.xml`. +3. **CreateSampleStep**: Creates `_sample.csv` from the downloaded data. +4. **CreateSchemaMapStep**: Generates PV map and config in `sample_output/` using `pvmap_generator.py`. +5. **ProcessFullDataStep**: Processes the full data using `stat_var_processor.py` to generate artifacts in `output/`. +6. **CreateDcConfigStep**: Generates `output/_config.json` for custom DC imports. + +## Directory Structure + +The pipeline organizes outputs within the specified `--working_dir`: + +``` +working_dir/ +├── _data.csv # Raw downloaded data +├── _metadata.xml # Raw downloaded metadata +├── _sample.csv # Sampled data +├── .state.json # Pipeline state for resuming runs +├── sample_output/ # Intermediate artifacts from mapping +│ ├── _pvmap.csv +│ └── _metadata.csv +└── output/ # Final Data Commons artifacts + ├── .csv + ├── .mcf + ├── .tmcf + └── _config.json +``` + +## State Management + +The pipeline automatically saves its state to a `.state.json` file in the working directory. +- **Resuming**: If a run is interrupted, running the same command again will resume from the last successful step. +- **Skipping**: Steps that have already completed successfully will be skipped unless `--force` is used. +- **Input Hashing**: The pipeline tracks input configuration. If critical configuration changes, it may trigger re-execution of steps. + +## Troubleshooting + +- **Gemini CLI Errors**: If the schema mapping step fails, check the Gemini CLI logs (usually in `.datacommons/runs/` within the working directory). +- **Missing Data**: Ensure the SDMX endpoint, agency, and dataflow ID are correct. Use `--verbose` to see the exact commands being run. 
+- **State Issues**: If the pipeline is stuck or behaving unexpectedly, you can delete `.state.json` to reset the state, or use `--force`. From 8e4e7a7db8f60accee9d6e227b01a8985ef02bc6 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 26 Nov 2025 10:42:28 +0000 Subject: [PATCH 33/54] fix: Move input file existence checks to `run` methods, allowing dry runs to succeed without pre-existing files and updating related tests. --- tools/agentic_import/sdmx_import_pipeline.py | 26 +++---- .../sdmx_import_pipeline_test.py | 76 ++++++++++++++----- 2 files changed, 68 insertions(+), 34 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 2510a755e1..1a6b81dd71 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -467,9 +467,6 @@ def _prepare_command(self) -> _StepContext: input_path = working_dir / f"{dataset_prefix}_data.csv" output_path = working_dir / f"{dataset_prefix}_sample.csv" - if not input_path.is_file(): - raise RuntimeError(f"Input file missing for sampling: {input_path}") - args = [ f"--sampler_input={input_path}", f"--sampler_output={output_path}", @@ -483,6 +480,9 @@ def _prepare_command(self) -> _StepContext: def run(self) -> None: context = self._prepare_command() + if not context.input_path.is_file(): + raise RuntimeError( + f"Input file missing for sampling: {context.input_path}") if self._config.run.verbose: logging.info( f"Starting data sampling: {' '.join(context.full_command)} -> {context.output_path}" @@ -523,11 +523,6 @@ def _prepare_command(self) -> _StepContext: metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix - if not sample_path.is_file(): - raise RuntimeError(f"Sample file missing: {sample_path}") - if not metadata_path.is_file(): - raise RuntimeError(f"Metadata file missing: {metadata_path}") - args = [ f"--input_data={sample_path}", f"--input_metadata={metadata_path}", @@ -549,6 +544,11 @@ def _prepare_command(self) -> _StepContext: def run(self) -> None: context = self._prepare_command() + if not context.sample_path.is_file(): + raise RuntimeError(f"Sample file missing: {context.sample_path}") + if not context.metadata_path.is_file(): + raise RuntimeError( + f"Metadata file missing: {context.metadata_path}") context.output_prefix.parent.mkdir(parents=True, exist_ok=True) logging.info( f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" @@ -595,11 +595,6 @@ def _prepare_command(self) -> _StepContext: f"{dataset_prefix}_metadata.csv") output_prefix = working_dir / FINAL_OUTPUT_DIR / dataset_prefix - for required in (input_data_path, pv_map_path, metadata_path): - if not required.is_file(): - raise RuntimeError( - f"{self.name} requires existing input: {required}") - args = [ f"--input_data={input_data_path}", f"--pv_map={pv_map_path}", @@ -621,6 +616,11 @@ def _prepare_command(self) -> _StepContext: def run(self) -> None: context = self._prepare_command() + for required in (context.input_data_path, context.pv_map_path, + context.metadata_path): + if not required.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {required}") # Ensure output directory exists context.output_prefix.parent.mkdir(parents=True, exist_ok=True) logging.info( diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 2c497197d6..ba4dbeb6a8 100644 --- 
a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -826,11 +826,7 @@ def test_create_sample_step_caches_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) - # Create test input file to satisfy validation - input_path = Path(self._tmpdir) / "demo_data.csv" - input_path.write_text("header\nrow1") - - # First call creates context + # No input file created, dry run should still succeed context1 = step._prepare_command() self.assertIn("data_sampler.py", context1.full_command[1]) self.assertIn("--sampler_output_rows=500", context1.full_command) @@ -851,7 +847,7 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) - # Create test input file + # Create test input file for run() input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") @@ -875,7 +871,9 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("data_sampler.py", args[0][1]) self.assertTrue(kwargs["verbose"]) - def test_create_sample_step_dry_run_fails_if_input_missing(self) -> None: + self.assertTrue(kwargs["verbose"]) + + def test_create_sample_step_dry_run_succeeds_if_input_missing(self) -> None: config = PipelineConfig( run=RunConfig( command="test", @@ -886,11 +884,24 @@ def test_create_sample_step_dry_run_fails_if_input_missing(self) -> None: sample=SampleConfig(rows=500), ) step = CreateSampleStep(name="test-step", config=config) - # No input file created + # No input file created, dry run should still succeed + step.dry_run() + def test_create_sample_step_run_fails_if_input_missing(self) -> None: + config = PipelineConfig( + run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ), + sample=SampleConfig(rows=500), + ) + step = CreateSampleStep(name="test-step", config=config) + # No input file created, run should fail with self.assertRaisesRegex(RuntimeError, "Input file missing for sampling"): - step.dry_run() + step.run() def test_create_schema_map_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -903,9 +914,7 @@ def test_create_schema_map_step_caches_plan(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) - # Create test input files to satisfy validation - (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") - (Path(self._tmpdir) / "demo_metadata.xml").write_text("") + # No input files created, dry run should still succeed # First call creates context context1 = step._prepare_command() @@ -926,7 +935,7 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) - # Create test input files + # Create test input files for run() (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") (Path(self._tmpdir) / "demo_metadata.xml").write_text("") @@ -950,7 +959,9 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("pvmap_generator.py", args[0][1]) self.assertTrue(kwargs["verbose"]) - def test_create_schema_map_step_dry_run_fails_if_input_missing( + self.assertTrue(kwargs["verbose"]) + + def test_create_schema_map_step_dry_run_succeeds_if_input_missing( self) -> None: config = PipelineConfig(run=RunConfig( command="test", @@ -959,9 +970,20 @@ def test_create_schema_map_step_dry_run_fails_if_input_missing( verbose=True, ),) step = 
CreateSchemaMapStep(name="test-step", config=config) - # No input files created + # No input files created, dry run should still succeed + step.dry_run() + + def test_create_schema_map_step_run_fails_if_input_missing(self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = CreateSchemaMapStep(name="test-step", config=config) + # No input files created, run should fail with self.assertRaises(RuntimeError): - step.dry_run() + step.run() def test_process_full_data_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -972,8 +994,7 @@ def test_process_full_data_step_caches_plan(self) -> None: ),) step = ProcessFullDataStep(name="test-step", config=config) - # Create test files to satisfy validation - self._create_test_input_files("demo") + # No input files created, dry run should still succeed context1 = step._prepare_command() context2 = step._prepare_command() @@ -1012,6 +1033,21 @@ def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: self.assertIn("--input_data=", args[0][2]) self.assertTrue(kwargs["verbose"]) + self.assertIn("--input_data=", args[0][2]) + self.assertTrue(kwargs["verbose"]) + + def test_process_full_data_step_dry_run_succeeds_if_input_missing( + self) -> None: + config = PipelineConfig(run=RunConfig( + command="test", + dataset_prefix="demo", + working_dir=self._tmpdir, + verbose=True, + ),) + step = ProcessFullDataStep(name="test-step", config=config) + # Missing input files, dry run should still succeed + step.dry_run() + def test_process_full_data_step_run_fails_if_input_missing(self) -> None: config = PipelineConfig(run=RunConfig( command="test", @@ -1020,11 +1056,9 @@ def test_process_full_data_step_run_fails_if_input_missing(self) -> None: verbose=True, ),) step = ProcessFullDataStep(name="test-step", config=config) - # Missing input files + # Missing input files, run should fail with self.assertRaises(RuntimeError): step.run() - with self.assertRaises(RuntimeError): - step.dry_run() def test_create_dc_config_step_caches_plan(self) -> None: config = self._build_config(dataset_prefix="demo", From 7b4bb8d16652798d9310cb6e33ab45b50c46e57f Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 03:28:22 +0000 Subject: [PATCH 34/54] feat: Introduce `working_dir` flag and encapsulate flag definitions to prevent `Duplicate --- tools/agentic_import/pvmap_generator.py | 86 ++++++++++++------- tools/agentic_import/pvmap_generator_test.py | 48 +++++++++++ tools/agentic_import/sdmx_import_pipeline.py | 23 +++-- .../sdmx_import_pipeline_test.py | 12 +++ 4 files changed, 126 insertions(+), 43 deletions(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index 2bf0217d98..110a8e68a8 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -33,45 +33,60 @@ _FLAGS = flags.FLAGS _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -flags.DEFINE_list('input_data', None, - 'List of input data file paths (required)') -flags.mark_flag_as_required('input_data') -# TODO: Allow users to provide original source path and auto-generate sample data files internally -flags.DEFINE_list('input_metadata', [], - 'List of input metadata file paths (optional)') +def _define_flags(): + try: + flags.DEFINE_list('input_data', None, + 'List of input data file paths (required)') + flags.mark_flag_as_required('input_data') -flags.DEFINE_boolean('sdmx_dataset', 
False, - 'Whether the dataset is in SDMX format (default: False)') + flags.DEFINE_list('input_metadata', [], + 'List of input metadata file paths (optional)') -flags.DEFINE_boolean('dry_run', False, - 'Generate prompt only without calling Gemini CLI') + flags.DEFINE_boolean( + 'sdmx_dataset', False, + 'Whether the dataset is in SDMX format (default: False)') -flags.DEFINE_string('maps_api_key', None, 'Google Maps API key (optional)') + flags.DEFINE_boolean('dry_run', False, + 'Generate prompt only without calling Gemini CLI') -flags.DEFINE_string('dc_api_key', None, 'Data Commons API key (optional)') + flags.DEFINE_string('maps_api_key', None, + 'Google Maps API key (optional)') -flags.DEFINE_integer('max_iterations', 10, - 'Maximum number of attempts for statvar processor.') + flags.DEFINE_string('dc_api_key', None, + 'Data Commons API key (optional)') -flags.DEFINE_boolean( - 'skip_confirmation', False, - 'Skip user confirmation before starting PV map generation') + flags.DEFINE_integer( + 'max_iterations', 10, + 'Maximum number of attempts for statvar processor.') -flags.DEFINE_boolean( - 'enable_sandboxing', - platform.system() == 'Darwin', - 'Enable sandboxing for Gemini CLI (default: True on macOS, False elsewhere)' -) + flags.DEFINE_boolean( + 'skip_confirmation', False, + 'Skip user confirmation before starting PV map generation') -flags.DEFINE_string( - 'output_path', 'output/output', - 'Output path prefix for all generated files (default: output/output)') + flags.DEFINE_boolean( + 'enable_sandboxing', + platform.system() == 'Darwin', + 'Enable sandboxing for Gemini CLI (default: True on macOS, False elsewhere)' + ) + + flags.DEFINE_string( + 'output_path', 'output/output', + 'Output path prefix for all generated files (default: output/output)' + ) -flags.DEFINE_string( - 'gemini_cli', 'gemini', 'Custom path or command to invoke Gemini CLI. ' - 'Example: "/usr/local/bin/gemini". ' - 'WARNING: This value is executed in a shell - use only with trusted input.') + flags.DEFINE_string( + 'gemini_cli', 'gemini', + 'Custom path or command to invoke Gemini CLI. ' + 'Example: "/usr/local/bin/gemini". ' + 'WARNING: This value is executed in a shell - use only with trusted input.' 
+ ) + + flags.DEFINE_string( + 'working_dir', None, + 'Working directory for the generator (default: current directory)') + except flags.DuplicateFlagError: + pass @dataclass @@ -93,6 +108,7 @@ class Config: enable_sandboxing: bool = False output_path: str = 'output/output' gemini_cli: Optional[str] = None + working_dir: Optional[str] = None @dataclass @@ -110,7 +126,12 @@ class PVMapGenerator: def __init__(self, config: Config): # Define working directory once for consistent path resolution - self._working_dir = Path.cwd() + self._working_dir = Path( + config.working_dir).resolve() if config.working_dir else Path.cwd() + if self._working_dir.exists() and not self._working_dir.is_dir(): + raise ValueError( + f"working_dir is not a directory: {self._working_dir}") + self._working_dir.mkdir(parents=True, exist_ok=True) # Copy config to avoid modifying the original self._config = copy.deepcopy(config) @@ -314,6 +335,7 @@ def _run_subprocess(self, command: str) -> int: stdout=subprocess.PIPE, stderr=subprocess.STDOUT, # Combine stderr with stdout shell=True, # Using shell to support pipe operations + cwd=self._working_dir, # Run in the specified working directory encoding='utf-8', errors='replace', bufsize=1, # Line buffered @@ -406,7 +428,8 @@ def prepare_config() -> Config: skip_confirmation=_FLAGS.skip_confirmation, enable_sandboxing=_FLAGS.enable_sandboxing, output_path=_FLAGS.output_path, - gemini_cli=_FLAGS.gemini_cli) + gemini_cli=_FLAGS.gemini_cli, + working_dir=_FLAGS.working_dir) def main(_): @@ -424,4 +447,5 @@ def main(_): if __name__ == '__main__': + _define_flags() app.run(main) diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index 32ad242787..293ecafa47 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -178,6 +178,54 @@ def test_rejects_paths_outside_working_directory(self): input_data=[str(external_file)], input_metadata=[]), dry_run=True)) + def test_generate_prompt_with_relative_working_dir(self): + # Create a subdirectory for the relative working directory test + sub_dir_name = 'sub_working_dir' + sub_dir = Path(self._temp_dir.name) / sub_dir_name + sub_dir.mkdir() + + # Create input files inside the subdirectory + data_file = sub_dir / 'input.csv' + data_file.write_text('header\nvalue') + metadata_file = sub_dir / 'metadata.csv' + metadata_file.write_text('parameter,value') + + # Use relative path for working_dir + config = Config( + data_config=DataConfig( + input_data=[ + str(data_file.relative_to(Path(self._temp_dir.name))) + ], # Relative to PWD + input_metadata=[ + str(metadata_file.relative_to(Path(self._temp_dir.name))) + ], # Relative to PWD + is_sdmx_dataset=False, + ), + dry_run=True, + max_iterations=3, + output_path='output/output_file', + working_dir=sub_dir_name, # Relative path + ) + + # We need to run from the parent directory so the relative path is valid + # The setUp already changed to self._temp_dir.name, so we are in the right place + + generator = PVMapGenerator(config) + result = generator.generate() + + self._assert_generation_result(result) + prompt_path = self._read_prompt_path(result) + prompt_text = prompt_path.read_text() + + # Verify that the working directory in the prompt is the absolute path of the subdirectory + expected_working_dir = str(sub_dir.resolve()) + self.assertIn(expected_working_dir, prompt_text) + self.assertIn(f'"working_dir": "{expected_working_dir}"', prompt_text) + + # Verify input paths are also absolute in 
the prompt + self.assertIn(str(data_file.resolve()), prompt_text) + self.assertIn(str(metadata_file.resolve()), prompt_text) + if __name__ == '__main__': unittest.main() diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 1a6b81dd71..d3224e587a 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -183,12 +183,10 @@ def _compute_critical_input_hash(config: PipelineConfig) -> str: def _resolve_working_dir(config: PipelineConfig) -> Path: - directory = Path(config.run.working_dir or os.getcwd()) - if directory.exists(): - if not directory.is_dir(): - raise ValueError(f"working_dir is not a directory: {directory}") - else: - directory.mkdir(parents=True, exist_ok=True) + directory = Path(config.run.working_dir or os.getcwd()).resolve() + if directory.exists() and not directory.is_dir(): + raise ValueError(f"working_dir is not a directory: {directory}") + directory.mkdir(parents=True, exist_ok=True) return directory @@ -350,7 +348,7 @@ def _prepare_command(self) -> _StepContext: dataflow = _require_config_field(self._config.sdmx.dataflow.id, _FLAG_SDMX_DATAFLOW_ID, self.name) dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() output_path = working_dir / f"{dataset_prefix}_data.csv" args = [ "download-data", @@ -411,7 +409,7 @@ def _prepare_command(self) -> _StepContext: dataflow = _require_config_field(self._config.sdmx.dataflow.id, _FLAG_SDMX_DATAFLOW_ID, self.name) dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() output_path = working_dir / f"{dataset_prefix}_metadata.xml" args = [ "download-metadata", @@ -463,7 +461,7 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() input_path = working_dir / f"{dataset_prefix}_data.csv" output_path = working_dir / f"{dataset_prefix}_sample.csv" @@ -518,7 +516,7 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() sample_path = working_dir / f"{dataset_prefix}_sample.csv" metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix @@ -533,6 +531,7 @@ def _prepare_command(self) -> _StepContext: args.append("--skip_confirmation") if self._config.run.gemini_cli: args.append(f"--gemini_cli={self._config.run.gemini_cli}") + args.append(f"--working_dir={working_dir}") full_command = [sys.executable, str(PVMAP_GENERATOR_PATH)] + args self._context = CreateSchemaMapStep._StepContext( @@ -587,7 +586,7 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() input_data_path = working_dir / f"{dataset_prefix}_data.csv" pv_map_path = (working_dir / SAMPLE_OUTPUT_DIR / f"{dataset_prefix}_pvmap.csv") @@ -655,7 +654,7 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context 
dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir) + working_dir = Path(self._config.run.working_dir).resolve() input_csv = working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}.csv" output_config = (working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}_config.json") diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index ba4dbeb6a8..7b050d80fd 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -710,6 +710,7 @@ def test_download_metadata_step_caches_plan(self) -> None: # Second call returns same object context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.output_path.is_absolute()) def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -777,6 +778,7 @@ def test_download_data_step_caches_plan(self) -> None: # Second call returns same object context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.output_path.is_absolute()) def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -834,6 +836,7 @@ def test_create_sample_step_caches_plan(self) -> None: # Second call returns same object context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.output_path.is_absolute()) def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -925,6 +928,9 @@ def test_create_schema_map_step_caches_plan(self) -> None: # Second call returns same object context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.sample_path.is_absolute()) + self.assertTrue(context1.metadata_path.is_absolute()) + self.assertTrue(context1.output_prefix.is_absolute()) def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -999,6 +1005,10 @@ def test_process_full_data_step_caches_plan(self) -> None: context1 = step._prepare_command() context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.input_data_path.is_absolute()) + self.assertTrue(context1.pv_map_path.is_absolute()) + self.assertTrue(context1.metadata_path.is_absolute()) + self.assertTrue(context1.output_prefix.is_absolute()) def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -1069,6 +1079,8 @@ def test_create_dc_config_step_caches_plan(self) -> None: context1 = step._prepare_command() context2 = step._prepare_command() self.assertIs(context1, context2) + self.assertTrue(context1.input_csv.is_absolute()) + self.assertTrue(context1.output_config.is_absolute()) def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: config = self._build_config(dataset_prefix="demo", From 5fd7e25830d5d29a04f9f3cbdefef294462a2c9f Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 03:57:58 +0000 Subject: [PATCH 35/54] docs: clarify SDMX import pipeline prerequisites, usage, step names, and state management. 
--- tools/agentic_import/sdmx_import_pipeline.md | 41 +++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.md b/tools/agentic_import/sdmx_import_pipeline.md index 050f2e60aa..32f7eae252 100644 --- a/tools/agentic_import/sdmx_import_pipeline.md +++ b/tools/agentic_import/sdmx_import_pipeline.md @@ -7,29 +7,38 @@ The SDMX Agentic Import Pipeline is a Python-based system designed to automate t The pipeline orchestrates several tools to handle the end-to-end import process: 1. **Download**: Retrieves data and metadata from SDMX endpoints. 2. **Sample**: Creates a manageable sample of the data for analysis. -3. **Map**: Generates Property-Value (PV) mappings using LLM-based tools. -4. **Process**: Converts the full dataset into Data Commons MCF and CSV formats. -5. **Config**: Generates configuration for custom Data Commons instances. +3. **Schema Mapping**: Generates Property-Value (PV) mappings using LLM-based tools. +4. **Full Data Processing**: Converts the full dataset into Data Commons MCF and CSV formats. +5. **Custom DC Config**: Generates configuration for custom Data Commons instances. ## Prerequisites -Before running the pipeline, ensure you have: -1. **Python Environment**: Set up as described in the [main README](./README.md#step-2-environment-setup). -2. **Gemini CLI**: Installed and configured for schema mapping. -3. **Data Commons API Key**: Set in your environment. +Before running the pipeline, ensure you have set up your environment as described in the [main README](./README.md#step-2-environment-setup). Key requirements include: + +1. **DC_DATA_REPO_PATH**: Environment variable pointing to your cloned Data Commons data repository. +2. **WORKING_DIR**: Environment variable pointing to your working directory. +3. **Python Environment**: Activated virtual environment with required dependencies. +4. **Gemini CLI**: Installed and configured for schema mapping. +5. **Data Commons API Key**: Set in your environment. ## Usage The pipeline is executed using the `sdmx_import_pipeline.py` script. +**Important:** The command must be run from within your working directory. + ### Basic Command ```bash -python tools/agentic_import/sdmx_import_pipeline.py \ +# Ensure you are in your working directory +cd $WORKING_DIR + +# Run the pipeline using the full path to the script +python $DC_DATA_REPO_PATH/tools/agentic_import/sdmx_import_pipeline.py \ --sdmx.endpoint="https://sdmx.example.org/data" \ --sdmx.agency="AGENCY_ID" \ --sdmx.dataflow.id="DATAFLOW_ID" \ - --working_dir="/path/to/working/dir" + --dataset_prefix="my_dataset" ``` ### Key Flags @@ -39,9 +48,10 @@ python tools/agentic_import/sdmx_import_pipeline.py \ - `--sdmx.dataflow.id`: The SDMX dataflow ID. - `--sdmx.dataflow.key`: (Optional) Filter key for data download. - `--sdmx.dataflow.param`: (Optional) Additional parameters for data download. -- `--working_dir`: Directory for input and output files. +- `--dataset_prefix`: (Optional) Prefix for generated artifacts. Useful for disambiguating multiple datasets in the same working directory. If not provided, it is derived from the dataflow ID. - `--sample.rows`: Number of rows for the sample dataset (default: 1000). - `--force`: Force re-execution of all steps, ignoring saved state. +- `--skip_confirmation`: Skip interactive confirmation prompts during schema mapping. - `--verbose`: Enable verbose logging. 
## Pipeline Steps @@ -57,14 +67,15 @@ The pipeline consists of the following steps, executed in order: ## Directory Structure -The pipeline organizes outputs within the specified `--working_dir`: +The pipeline organizes outputs within the specified working directory: ``` working_dir/ ├── _data.csv # Raw downloaded data ├── _metadata.xml # Raw downloaded metadata ├── _sample.csv # Sampled data -├── .state.json # Pipeline state for resuming runs +├── .datacommons/ +│ └── .state.json # Pipeline state for resuming runs ├── sample_output/ # Intermediate artifacts from mapping │ ├── _pvmap.csv │ └── _metadata.csv @@ -77,13 +88,13 @@ working_dir/ ## State Management -The pipeline automatically saves its state to a `.state.json` file in the working directory. +The pipeline automatically saves its state to a `.state.json` file in the `.datacommons/` directory within your working directory. - **Resuming**: If a run is interrupted, running the same command again will resume from the last successful step. - **Skipping**: Steps that have already completed successfully will be skipped unless `--force` is used. - **Input Hashing**: The pipeline tracks input configuration. If critical configuration changes, it may trigger re-execution of steps. ## Troubleshooting -- **Gemini CLI Errors**: If the schema mapping step fails, check the Gemini CLI logs (usually in `.datacommons/runs/` within the working directory). +- **Gemini CLI Errors**: If the schema mapping step fails, check the Gemini CLI logs (usually in `.datacommons/runs/` within the working directory). Refer to the [main README](./README.md#debugging) for detailed debugging instructions. - **Missing Data**: Ensure the SDMX endpoint, agency, and dataflow ID are correct. Use `--verbose` to see the exact commands being run. -- **State Issues**: If the pipeline is stuck or behaving unexpectedly, you can delete `.state.json` to reset the state, or use `--force`. +- **State Issues**: If the pipeline is stuck or behaving unexpectedly, you can delete the state file to reset the state, or use `--force`. From eff87058f257e713fc06bedbfe0dd2efe511a5c7 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 04:10:41 +0000 Subject: [PATCH 36/54] refactor: extract common SDMX test setup and helper methods into a new `SdmxTestBase` class. 
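Tests that need SDMX fixtures can now subclass the shared base instead of duplicating setUp and config plumbing. Roughly, a new test case in this file reduces to the following sketch (the class name and assertion are illustrative, not part of this patch):

    class DemoStepTest(SdmxTestBase):

        def test_build_config_uses_temp_working_dir(self) -> None:
            # Temp dir creation and cleanup come from SdmxTestBase.setUp.
            self._create_test_input_files("demo")
            config = self._build_config(dataset_prefix="demo")
            self.assertEqual(config.run.working_dir, self._tmpdir)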
--- .../sdmx_import_pipeline_test.py | 110 +++++++----------- 1 file changed, 45 insertions(+), 65 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 7b050d80fd..73bad6e7ba 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -455,14 +455,37 @@ def test_incremental_records_skip_reasons(self) -> None: self.assertIn("up-to-date", decision.reason) -class RunPipelineTest(unittest.TestCase): +class SdmxTestBase(unittest.TestCase): - def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, - command: str) -> PipelineConfig: + def setUp(self) -> None: + self._tmpdir_obj = tempfile.TemporaryDirectory() + self.addCleanup(self._tmpdir_obj.cleanup) + self._tmpdir = self._tmpdir_obj.name + + def _create_test_input_files(self, prefix: str) -> None: + (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") + (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") + (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") + + sample_output_dir = Path(self._tmpdir) / "sample_output" + sample_output_dir.mkdir(parents=True, exist_ok=True) + (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") + (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") + + output_dir = Path(self._tmpdir) / "output" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / f"{prefix}.csv").write_text("output") + + def _build_config(self, + dataset_prefix: str | None, + dataflow: str | None = "FLOW", + command: str = "test", + endpoint: str = "https://example.com", + agency: str = "AGENCY") -> PipelineConfig: return PipelineConfig( sdmx=SdmxConfig( - endpoint="https://api.example.com", - agency="TEST_AGENCY", + endpoint=endpoint, + agency=agency, dataflow=SdmxDataflowConfig( id=dataflow, key="test-key", @@ -477,30 +500,17 @@ def _build_config(self, *, dataset_prefix: str | None, dataflow: str | None, ), ) + +class RunPipelineTest(SdmxTestBase): + def setUp(self) -> None: - self._tmpdir_obj = tempfile.TemporaryDirectory() - self.addCleanup(self._tmpdir_obj.cleanup) - self._tmpdir = self._tmpdir_obj.name + super().setUp() # Mock _run_command to avoid actual execution during pipeline tests self._run_command_patcher = mock.patch( "tools.agentic_import.sdmx_import_pipeline._run_command") self._mock_run_command = self._run_command_patcher.start() self.addCleanup(self._run_command_patcher.stop) - def _create_test_input_files(self, prefix: str) -> None: - (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") - (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") - (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") - - sample_output_dir = Path(self._tmpdir) / "sample_output" - sample_output_dir.mkdir(parents=True, exist_ok=True) - (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") - (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") - - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / f"{prefix}.csv").write_text("output") - def test_run_pipeline_updates_state_and_hash(self) -> None: command = "sdmx run pipeline" config = self._build_config(dataset_prefix="demo", @@ -641,39 +651,7 @@ def test_hash_unchanged_skips_rerun(self) -> None: self.assertEqual(first_state, second_state) -class SdmxStepTest(unittest.TestCase): - - def setUp(self) -> None: - self._tmpdir_obj = 
tempfile.TemporaryDirectory() - self.addCleanup(self._tmpdir_obj.cleanup) - self._tmpdir = self._tmpdir_obj.name - - def _create_test_input_files(self, prefix: str) -> None: - (Path(self._tmpdir) / f"{prefix}_data.csv").write_text("data") - (Path(self._tmpdir) / f"{prefix}_sample.csv").write_text("sample") - (Path(self._tmpdir) / f"{prefix}_metadata.xml").write_text("metadata") - - sample_output_dir = Path(self._tmpdir) / "sample_output" - sample_output_dir.mkdir(parents=True, exist_ok=True) - (sample_output_dir / f"{prefix}_pvmap.csv").write_text("pvmap") - (sample_output_dir / f"{prefix}_metadata.csv").write_text("metadata") - - output_dir = Path(self._tmpdir) / "output" - output_dir.mkdir(parents=True, exist_ok=True) - (output_dir / f"{prefix}.csv").write_text("output") - - def _build_config(self, - dataset_prefix: str | None, - endpoint: str = "https://example.com", - agency: str = "AGENCY", - dataflow: str = "FLOW") -> PipelineConfig: - return PipelineConfig(sdmx=SdmxConfig( - endpoint=endpoint, - agency=agency, - dataflow=SdmxDataflowConfig(id=dataflow)), - run=RunConfig(command="test", - dataset_prefix=dataset_prefix, - working_dir=self._tmpdir)) +class SdmxStepTest(SdmxTestBase): def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: @@ -830,7 +808,8 @@ def test_create_sample_step_caches_plan(self) -> None: # No input file created, dry run should still succeed context1 = step._prepare_command() - self.assertIn("data_sampler.py", context1.full_command[1]) + self.assertTrue( + any("data_sampler.py" in arg for arg in context1.full_command)) self.assertIn("--sampler_output_rows=500", context1.full_command) # Second call returns same object @@ -871,7 +850,7 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: # Verify run called the command with the same args mock_run_cmd.assert_called_once() args, kwargs = mock_run_cmd.call_args - self.assertIn("data_sampler.py", args[0][1]) + self.assertTrue(any("data_sampler.py" in arg for arg in args[0])) self.assertTrue(kwargs["verbose"]) self.assertTrue(kwargs["verbose"]) @@ -921,7 +900,8 @@ def test_create_schema_map_step_caches_plan(self) -> None: # First call creates context context1 = step._prepare_command() - self.assertIn("pvmap_generator.py", context1.full_command[1]) + self.assertTrue( + any("pvmap_generator.py" in arg for arg in context1.full_command)) self.assertIn("--gemini_cli=custom-gemini", context1.full_command) self.assertIn("--skip_confirmation", context1.full_command) @@ -962,7 +942,7 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: # Verify run called the command with the same args mock_run_cmd.assert_called_once() args, kwargs = mock_run_cmd.call_args - self.assertIn("pvmap_generator.py", args[0][1]) + self.assertTrue(any("pvmap_generator.py" in arg for arg in args[0])) self.assertTrue(kwargs["verbose"]) self.assertTrue(kwargs["verbose"]) @@ -1039,11 +1019,10 @@ def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: # Verify run called the command with the same args mock_run_cmd.assert_called_once() args, kwargs = mock_run_cmd.call_args - self.assertIn("stat_var_processor.py", args[0][1]) - self.assertIn("--input_data=", args[0][2]) - self.assertTrue(kwargs["verbose"]) - - self.assertIn("--input_data=", args[0][2]) + self.assertTrue( + any("stat_var_processor.py" in arg for arg in args[0])) + self.assertTrue( + any(arg.startswith("--input_data=") for arg in args[0])) self.assertTrue(kwargs["verbose"]) def 
test_process_full_data_step_dry_run_succeeds_if_input_missing( @@ -1102,7 +1081,8 @@ def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: mock_run_cmd.assert_called_once() args, kwargs = mock_run_cmd.call_args command = args[0] - self.assertIn("generate_custom_dc_config.py", command[1]) + self.assertTrue( + any("generate_custom_dc_config.py" in arg for arg in command)) self.assertIn(f"--input_csv={final_output_dir}/demo.csv", command) self.assertIn( f"--output_config={final_output_dir}/demo_config.json", command) From 11abd361efaf112b6b5ade660bd4a05f154ab8d0 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 04:55:09 +0000 Subject: [PATCH 37/54] feat: Resolve relative input and output paths against the working directory in PVMapGenerator. --- tools/agentic_import/pvmap_generator.py | 14 +++++--- tools/agentic_import/pvmap_generator_test.py | 38 ++++++++++++++++---- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index 110a8e68a8..60f4d9a955 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -151,13 +151,16 @@ def __init__(self, config: Config): ] # Parse output_path into directory and basename components - output_path = Path(self._config.output_path) + # Parse output_path, handling relative paths and ~ expansion + output_path = Path(self._config.output_path).expanduser() + if not output_path.is_absolute(): + output_path = self._working_dir / output_path + self._output_dir = output_path.parent self._output_basename = output_path.name # Create output directory if it doesn't exist - output_full_dir = self._working_dir / self._output_dir - output_full_dir.mkdir(parents=True, exist_ok=True) + self._output_dir.mkdir(parents=True, exist_ok=True) self._datacommons_dir = self._initialize_datacommons_dir() @@ -171,7 +174,10 @@ def __init__(self, config: Config): def _validate_and_convert_path(self, path: str) -> Path: """Convert path to absolute and validate it's within working directory.""" - real_path = Path(path).expanduser().resolve() + p = Path(path).expanduser() + if not p.is_absolute(): + p = self._working_dir / p + real_path = p.resolve() working_dir = self._working_dir.resolve() try: real_path.relative_to(working_dir) diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index 293ecafa47..0095770065 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -193,12 +193,8 @@ def test_generate_prompt_with_relative_working_dir(self): # Use relative path for working_dir config = Config( data_config=DataConfig( - input_data=[ - str(data_file.relative_to(Path(self._temp_dir.name))) - ], # Relative to PWD - input_metadata=[ - str(metadata_file.relative_to(Path(self._temp_dir.name))) - ], # Relative to PWD + input_data=['input.csv'], # Relative to working_dir + input_metadata=['metadata.csv'], # Relative to working_dir is_sdmx_dataset=False, ), dry_run=True, @@ -226,6 +222,36 @@ def test_generate_prompt_with_relative_working_dir(self): self.assertIn(str(data_file.resolve()), prompt_text) self.assertIn(str(metadata_file.resolve()), prompt_text) + def test_relative_paths_resolved_against_working_dir(self): + # Create a separate working directory + with tempfile.TemporaryDirectory() as work_dir: + work_path = Path(work_dir) + # Create input files inside the working directory + data_file = work_path / 'input.csv' + 
data_file.write_text('header\nvalue') + + # Run from a different directory (current temp dir) + # Use relative path to input file, which should be resolved against work_dir + config = Config( + data_config=DataConfig( + input_data=['input.csv'], # Relative to work_dir + input_metadata=[], + is_sdmx_dataset=False, + ), + dry_run=True, + working_dir=work_dir, + ) + + # This should not raise ValueError because input.csv is found in work_dir + generator = PVMapGenerator(config) + result = generator.generate() + self._assert_generation_result(result) + self.assertEqual(str(generator._config.data_config.input_data[0]), + str(data_file.resolve())) + # Verify output directory is also under working_dir + self.assertTrue( + str(generator._output_dir).startswith(str(work_path.resolve()))) + if __name__ == '__main__': unittest.main() From b08f8ca4a26e2891e1d5d8150bbc563de2cafc61 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 06:15:12 +0000 Subject: [PATCH 38/54] feat: Add working directory flag to SDMX import pipeline and ensure absolute output path in pvmap generator. --- tools/agentic_import/pvmap_generator.py | 1 + tools/agentic_import/pvmap_generator_test.py | 3 +++ tools/agentic_import/sdmx_import_pipeline.py | 5 ++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index 60f4d9a955..cdbac209fe 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -158,6 +158,7 @@ def __init__(self, config: Config): self._output_dir = output_path.parent self._output_basename = output_path.name + self._config.output_path = str(output_path) # Create output directory if it doesn't exist self._output_dir.mkdir(parents=True, exist_ok=True) diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index 0095770065..cad6f360ab 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -116,6 +116,9 @@ def _assert_prompt_content(self, prompt_path: Path, *, expect_sdmx: bool, self.assertIn(f'You have exactly {config.max_iterations} attempts', prompt_text) + # Output path should be absolute in the prompt + self.assertIn(f'--output-path "{config.output_path}"', prompt_text) + if expect_sdmx: # SDMX prompts highlight dataset type and show SDMX-specific banner. self.assertIn('"dataset_type": "sdmx"', prompt_text) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index d3224e587a..3c933f5d5a 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -939,7 +939,7 @@ def prepare_config() -> PipelineConfig: run=RunConfig( command=command, dataset_prefix=FLAGS.dataset_prefix, - working_dir=None, + working_dir=FLAGS.working_dir, run_only=FLAGS.run_only, force=FLAGS.force, verbose=FLAGS.verbose, @@ -988,6 +988,9 @@ def _define_flags() -> None: flags.DEFINE_string("gemini_cli", "gemini", "Path to Gemini CLI executable.") + flags.DEFINE_string("working_dir", None, + "Working directory for the pipeline.") + def main(_: list[str]) -> int: config = prepare_config() From cd9041a403f4b6d42bd0b46d7058210800291fec Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 06:19:27 +0000 Subject: [PATCH 39/54] docs: Consolidate comments for output_path parsing logic. 
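For reference, the consolidated comment documents the resolution rule introduced in the previous commits; a standalone sketch of that behavior follows (the helper name is illustrative — the real logic lives inline in PVMapGenerator.__init__):

    from pathlib import Path

    def resolve_output_path(output_path: str, working_dir: Path) -> Path:
        # Expand ~ first, then anchor relative paths under the working directory.
        path = Path(output_path).expanduser()
        if not path.is_absolute():
            path = working_dir / path
        return path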
--- tools/agentic_import/pvmap_generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index cdbac209fe..0edbd0f88b 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -150,8 +150,7 @@ def __init__(self, config: Config): for path in self._config.data_config.input_metadata ] - # Parse output_path into directory and basename components - # Parse output_path, handling relative paths and ~ expansion + # Parse output_path into directory and basename components, handling relative paths and ~ expansion output_path = Path(self._config.output_path).expanduser() if not output_path.is_absolute(): output_path = self._working_dir / output_path From 650800bf5d85e5797323fb2e40d8576c74b3f5dc Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 06:46:37 +0000 Subject: [PATCH 40/54] fix: enforce absolute generator paths Point prompt vars and tests at output_path_abs and resolved working/script dirs --- tools/agentic_import/pvmap_generator.py | 33 +++++----- tools/agentic_import/pvmap_generator_test.py | 3 +- .../templates/generate_pvmap_prompt.j2 | 66 +++++++++---------- 3 files changed, 52 insertions(+), 50 deletions(-) diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index 0edbd0f88b..eb1cdf5bdb 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -150,17 +150,18 @@ def __init__(self, config: Config): for path in self._config.data_config.input_metadata ] - # Parse output_path into directory and basename components, handling relative paths and ~ expansion + # Parse output_path into absolute path, handling relative paths and ~ expansion output_path = Path(self._config.output_path).expanduser() if not output_path.is_absolute(): output_path = self._working_dir / output_path + self._output_path_abs = output_path.resolve() - self._output_dir = output_path.parent - self._output_basename = output_path.name - self._config.output_path = str(output_path) + self._output_dir_abs = self._output_path_abs.parent + self._output_basename = self._output_path_abs.name + self._config.output_path = str(self._output_path_abs) # Create output directory if it doesn't exist - self._output_dir.mkdir(parents=True, exist_ok=True) + self._output_dir_abs.mkdir(parents=True, exist_ok=True) self._datacommons_dir = self._initialize_datacommons_dir() @@ -211,7 +212,7 @@ def _get_user_confirmation(self, prompt_file: Path) -> bool: print(f"Generated prompt: {prompt_file}") print(f"Working directory: {self._working_dir}") print(f"Output path: {self._config.output_path}") - print(f"Output directory: {self._output_dir}") + print(f"Output directory: {self._output_dir_abs}") print(f"Output basename: {self._output_basename}") print( f"Sandboxing: {'Enabled' if self._config.enable_sandboxing else 'Disabled'}" @@ -376,21 +377,21 @@ def _generate_prompt(self) -> Path: template = env.get_template('generate_pvmap_prompt.j2') # Calculate paths and prepare template variables - working_dir = str(self._working_dir) # Use defined working directory + working_dir = str(self._working_dir) # Absolute working directory # Point to tools/ directory (parent of agentic_import) - tools_dir = os.path.abspath(os.path.join(_SCRIPT_DIR, '..')) + tools_dir = os.path.abspath(os.path.join(_SCRIPT_DIR, '..')) # Absolute template_vars = { - 'working_dir': + 'working_dir_abs': working_dir, 'python_interpreter': 
sys.executable, - 'script_dir': + 'script_dir_abs': tools_dir, - 'input_data': + 'input_data_abs': str(self._config.data_config.input_data[0]) if self._config.data_config.input_data else "", - 'input_metadata': [ + 'input_metadata_abs': [ str(path) for path in self._config.data_config.input_metadata ] if self._config.data_config.input_metadata else [], 'dataset_type': @@ -400,10 +401,10 @@ def _generate_prompt(self) -> Path: 'gemini_run_id': self. _gemini_run_id, # Pass the gemini run ID for backup tracking - 'output_path': - self._config.output_path, # Full path for statvar processor - 'output_dir': - str(self._output_dir), # Directory for pvmap/metadata files + 'output_path_abs': + str(self._output_path_abs), # Absolute path prefix for outputs + 'output_dir_abs': + str(self._output_dir_abs), # Directory for pvmap/metadata files 'output_basename': self._output_basename # Base name for pvmap/metadata files } diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index cad6f360ab..5323a684f8 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -253,7 +253,8 @@ def test_relative_paths_resolved_against_working_dir(self): str(data_file.resolve())) # Verify output directory is also under working_dir self.assertTrue( - str(generator._output_dir).startswith(str(work_path.resolve()))) + str(generator._output_dir_abs).startswith( + str(work_path.resolve()))) if __name__ == '__main__': diff --git a/tools/agentic_import/templates/generate_pvmap_prompt.j2 b/tools/agentic_import/templates/generate_pvmap_prompt.j2 index 99c6445d13..8fafc6ab47 100644 --- a/tools/agentic_import/templates/generate_pvmap_prompt.j2 +++ b/tools/agentic_import/templates/generate_pvmap_prompt.j2 @@ -1242,7 +1242,7 @@ When working with SDMX datasets, follow these additional guidelines: # CORE TASK Your primary goal is to analyze the provided CSV data and generate a complete -and valid `{{output_dir}}/{{output_basename}}_pvmap.csv` and `{{output_dir}}/{{output_basename}}_metadata.csv` files which can be used with Statvar +and valid `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv` files which can be used with Statvar processor tool to produce the final DataCommons artifacts. ## 📌 IMPORTANT: FILE NAMING CONVENTION @@ -1262,18 +1262,18 @@ This naming convention allows multiple datasets to be processed in the same work - ✅ No file conflicts or overwrites - ✅ Easy to organize outputs by topic or date -**Current Task**: For this specific run, your output path is `{{output_path}}`. +**Current Task**: For this specific run, your output path is `{{output_path_abs}}`. Throughout this documentation, you will see references to generic file names. 
**You MUST use the following specific file names for this task:** | Documentation Reference | Actual File You Must Create | |------------------------|----------------------------| -| `pvmap.csv` | `{{output_dir}}/{{output_basename}}_pvmap.csv` | -| `metadata.csv` | `{{output_dir}}/{{output_basename}}_metadata.csv` | -| `output.csv` | `{{output_path}}.csv` | +| `pvmap.csv` | `{{output_path_abs}}_pvmap.csv` | +| `metadata.csv` | `{{output_path_abs}}_metadata.csv` | +| `output.csv` | `{{output_path_abs}}.csv` | -**Example**: When the documentation says "create pvmap.csv", you must actually create `{{output_dir}}/{{output_basename}}_pvmap.csv` +**Example**: When the documentation says "create pvmap.csv", you must actually create `{{output_path_abs}}_pvmap.csv` **REMEMBER**: Whenever you see generic file names in the instructions, always use the specific names with the output path prefix. @@ -1343,11 +1343,11 @@ Follow these steps sequentially. {%- endif %} -**2. Generate `{{output_dir}}/{{output_basename}}_pvmap.csv` and `{{output_dir}}/{{output_basename}}_metadata.csv`** +**2. Generate `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv`** -- Create the `{{output_dir}}/{{output_basename}}_pvmap.csv` file, mapping the source data columns to DataCommons properties based on your findings. -- Create the `{{output_dir}}/{{output_basename}}_metadata.csv` file and define the necessary `statvar_processor` configuration parameters within it. -- Configuration rule: All processor flags/settings must live in `{{output_dir}}/{{output_basename}}_metadata.csv`. Do not embed configuration in `{{output_dir}}/{{output_basename}}_pvmap.csv` and do not rely on extra CLI flags. +- Create the `{{output_path_abs}}_pvmap.csv` file, mapping the source data columns to DataCommons properties based on your findings. +- Create the `{{output_path_abs}}_metadata.csv` file and define the necessary `statvar_processor` configuration parameters within it. +- Configuration rule: All processor flags/settings must live in `{{output_path_abs}}_metadata.csv`. Do not embed configuration in `{{output_path_abs}}_pvmap.csv` and do not rely on extra CLI flags. ### Validation Checklist While generating the files, ensure: @@ -1365,9 +1365,9 @@ While generating the files, ensure: - [ ] **Special/missing values mapped appropriately** - Use `#ignore` ONLY to drop entire rows. 
For skipping individual cell values, use empty mapping: `column:value,IntermediateProperty,''` (preserves row, skips cell) #### Metadata CSV Validation: -- [ ] **{{output_dir}}/{{output_basename}}_metadata.csv covers processor flags** - Includes required parameters (e.g., `header_rows`) -- [ ] **No config in {{output_dir}}/{{output_basename}}_pvmap.csv** - `{{output_dir}}/{{output_basename}}_pvmap.csv` contains only PV mappings, not processor settings -- [ ] **No extra CLI flags** - Configuration is exclusively in `{{output_dir}}/{{output_basename}}_metadata.csv`; wrapper provides input paths +- [ ] **{{output_path_abs}}_metadata.csv covers processor flags** - Includes required parameters (e.g., `header_rows`) +- [ ] **No config in {{output_path_abs}}_pvmap.csv** - `{{output_path_abs}}_pvmap.csv` contains only PV mappings, not processor settings +- [ ] **No extra CLI flags** - Configuration is exclusively in `{{output_path_abs}}_metadata.csv`; wrapper provides input paths - [ ] **Parameter names match documentation** - Not CLI flag names - [ ] **Quote values containing commas** - `key,"value1,value2,value3"` @@ -1394,13 +1394,13 @@ For SDMX datasets, also ensure: ```bash # Run statvar processor using dedicated script -{{script_dir}}/agentic_import/run_statvar_processor.sh \ +{{script_dir_abs}}/agentic_import/run_statvar_processor.sh \ --python "{{python_interpreter}}" \ - --script-dir "{{script_dir}}" \ - --working-dir "{{working_dir}}" \ - --input-data "{{input_data}}" \ + --script-dir "{{script_dir_abs}}" \ + --working-dir "{{working_dir_abs}}" \ + --input-data "{{input_data_abs}}" \ --gemini-run-id "{{gemini_run_id}}" \ - --output-path "{{output_path}}" + --output-path "{{output_path_abs}}" ``` The wrapper reads `metadata.csv` for all processor configuration. Do not add extra flags to this command. @@ -1409,11 +1409,11 @@ The wrapper reads `metadata.csv` for all processor configuration. Do not add ext **📊 VALIDATION CHECKLIST**: - Check the command exit code (0 = success, non-zero = failure) -- Verify that `{{working_dir}}/{{output_path}}.csv` exists and is not empty +- Verify that `{{output_path_abs}}.csv` exists and is not empty - Confirm no duplicate entries for same place, date, and variable - **Verify output.csv contains all required columns**: Must include at minimum `observationAbout`, `observationDate`, `variableMeasured`, `value` -- **Verify complete column mapping**: Any observation properties mapped in {{output_dir}}/{{output_basename}}_pvmap.csv (like `unit`, `scalingFactor`, `measurementMethod`, `observationPeriod`) must be present as columns in `{{working_dir}}/{{output_path}}.csv` -- **Verify `{{output_dir}}/{{output_basename}}_metadata.csv` completeness**: Confirm `header_rows` parameter is present and correctly specified +- **Verify complete column mapping**: Any observation properties mapped in {{output_path_abs}}_pvmap.csv (like `unit`, `scalingFactor`, `measurementMethod`, `observationPeriod`) must be present as columns in `{{output_path_abs}}.csv` +- **Verify `{{output_path_abs}}_metadata.csv` completeness**: Confirm `header_rows` parameter is present and correctly specified **🎯 DECISION LOGIC - APPLY THIS EXACTLY**: @@ -1428,22 +1428,22 @@ IF all items in the VALIDATION CHECKLIST above pass: ELIF CURRENT_ATTEMPT < {{max_iterations}}: → OUTPUT: "❌ ATTEMPT CURRENT_ATTEMPT FAILED - Error details: [describe specific error]" → OUTPUT: "🔄 Starting attempt [CURRENT_ATTEMPT + 1] of {{max_iterations}}..." - → Analyze the error from logs. 
In case statvar processor failed, read log file at: {{working_dir}}/.datacommons/processor.log + → Analyze the error from logs. In case statvar processor failed, read log file at: {{working_dir_abs}}/.datacommons/processor.log {# TODO: move debugging instructions to separate section #} - → **Common {{output_dir}}/{{output_basename}}_metadata.csv issues to check:** + → **Common {{output_path_abs}}_metadata.csv issues to check:** • Missing or wrong `header_rows` (should be 1 for standard CSV with headers) • Wrong `skip_rows` value skipping too much data • Debugging parameters left in production (`process_rows`, `input_rows`, `input_columns`) • Place resolution issues: missing `places_within` or wrong `place_type` - → Modify {{output_dir}}/{{output_basename}}_pvmap.csv and/or {{output_dir}}/{{output_basename}}_metadata.csv to fix identified issues + → Modify {{output_path_abs}}_pvmap.csv and/or {{output_path_abs}}_metadata.csv to fix identified issues → INCREMENT ATTEMPT COUNTER → Return to Step 5 (Run the Processor) ELSE (CURRENT_ATTEMPT >= {{max_iterations}}): → OUTPUT: "⛔ ITERATION LIMIT REACHED: Failed after {{max_iterations}} attempts" → OUTPUT: "📋 Final Status: FAILED - Manual intervention required" - → OUTPUT: "📁 Check logs at: {{working_dir}}/.datacommons/ for debugging" - → OUTPUT: "📁 Check backup at: {{working_dir}}/runs/{{gemini_run_id}}/ for debugging" + → OUTPUT: "📁 Check logs at: {{working_dir_abs}}/.datacommons/ for debugging" + → OUTPUT: "📁 Check backup at: {{working_dir_abs}}/runs/{{gemini_run_id}}/ for debugging" → STOP EXECUTION IMMEDIATELY → DO NOT MAKE ANY MORE ATTEMPTS ``` @@ -1462,21 +1462,21 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp ```json { - "input_data": ["{{input_data}}"], - "input_metadata": {{input_metadata | tojson}}, - "working_dir": "{{working_dir}}", - "output_dir": "{{working_dir}}/{{output_dir}}", + "input_data": ["{{input_data_abs}}"], + "input_metadata": {{input_metadata_abs | tojson}}, + "working_dir": "{{working_dir_abs}}", + "output_dir": "{{output_dir_abs}}", "dataset_type": "{{dataset_type}}" } ``` # OUTPUT REQUIREMENTS & FINAL INSTRUCTION -- Generate `{{output_dir}}/{{output_basename}}_pvmap.csv` and `{{output_dir}}/{{output_basename}}_metadata.csv` +- Generate `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv` - **Adhere to Rules:** Strictly follow all schema rules, property requirements, and formatting guidelines from the knowledge base. - DO NOT deviate from the documented standards. -- Configuration location: Place all processor flags/settings in `{{output_dir}}/{{output_basename}}_metadata.csv` only. Do not embed settings in `{{output_dir}}/{{output_basename}}_pvmap.csv` and do not propose additional CLI flags. +- Configuration location: Place all processor flags/settings in `{{output_path_abs}}_metadata.csv` only. Do not embed settings in `{{output_path_abs}}_pvmap.csv` and do not propose additional CLI flags. # 🛑 FINAL EXECUTION REMINDERS @@ -1491,7 +1491,7 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp # ACTION REQUIRED NOW -**Execute** the data analysis and generate the `{{output_dir}}/{{output_basename}}_pvmap.csv` and `{{output_dir}}/{{output_basename}}_metadata.csv` +**Execute** the data analysis and generate the `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv` files now. Follow the primary workflow **WITHOUT** deviation. **REMEMBER**: You have {{max_iterations}} attempts maximum. 
Track each attempt and stop when you succeed or reach the limit. From a97e212f058dedab7d5ae69a49f00201e4c52ae5 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:06:12 +0000 Subject: [PATCH 41/54] Refactor run/dry-run test helpers --- .../sdmx_import_pipeline_test.py | 206 +++++++----------- 1 file changed, 78 insertions(+), 128 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 73bad6e7ba..8501db8000 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -653,6 +653,34 @@ def test_hash_unchanged_skips_rerun(self) -> None: class SdmxStepTest(SdmxTestBase): + def _assert_run_and_dry_run_use_same_plan(self, + step, + *, + log_contains: str, + cmd_contains: str, + extra_cmd_checks=None, + expect_verbose: bool = True + ) -> None: + extra_cmd_checks = extra_cmd_checks or [] + with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + ) as mock_run_cmd: + with self.assertLogs(logging.get_absl_logger(), + level="INFO") as logs: + step.dry_run() + step.run() + + self.assertTrue( + any("test-step (dry run): would run" in entry + for entry in logs.output)) + self.assertTrue(any(log_contains in entry for entry in logs.output)) + mock_run_cmd.assert_called_once() + args, kwargs = mock_run_cmd.call_args + command = args[0] + self.assertTrue(any(cmd_contains in arg for arg in command)) + self.assertEqual(kwargs["verbose"], expect_verbose) + for check in extra_cmd_checks: + check(command) + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -705,26 +733,11 @@ def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: ), ) step = DownloadMetadataStep(name="test-step", config=config) - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("download-metadata" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertIn("download-metadata", args[0]) - self.assertTrue(kwargs["verbose"]) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="download-metadata", + cmd_contains="download-metadata", + ) def test_download_data_step_caches_plan(self) -> None: config = PipelineConfig( @@ -773,26 +786,11 @@ def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: ), ) step = DownloadDataStep(name="test-step", config=config) - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("download-data" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertIn("download-data", args[0]) - self.assertTrue(kwargs["verbose"]) + 
self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="download-data", + cmd_contains="download-data", + ) def test_create_sample_step_caches_plan(self) -> None: config = PipelineConfig( @@ -832,28 +830,11 @@ def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: # Create test input file for run() input_path = Path(self._tmpdir) / "demo_data.csv" input_path.write_text("header\nrow1") - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("data_sampler.py" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertTrue(any("data_sampler.py" in arg for arg in args[0])) - self.assertTrue(kwargs["verbose"]) - - self.assertTrue(kwargs["verbose"]) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="data_sampler.py", + cmd_contains="data_sampler.py", + ) def test_create_sample_step_dry_run_succeeds_if_input_missing(self) -> None: config = PipelineConfig( @@ -924,28 +905,11 @@ def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: # Create test input files for run() (Path(self._tmpdir) / "demo_sample.csv").write_text("header\nrow1") (Path(self._tmpdir) / "demo_metadata.xml").write_text("") - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("pvmap_generator.py" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertTrue(any("pvmap_generator.py" in arg for arg in args[0])) - self.assertTrue(kwargs["verbose"]) - - self.assertTrue(kwargs["verbose"]) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="pvmap_generator.py", + cmd_contains="pvmap_generator.py", + ) def test_create_schema_map_step_dry_run_succeeds_if_input_missing( self) -> None: @@ -1001,29 +965,15 @@ def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: # Create test files self._create_test_input_files("demo") - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - with self.assertLogs(logging.get_absl_logger(), - level="INFO") as logs: - step.dry_run() - step.run() - - # Verify dry_run logged the command - self.assertTrue( - any("test-step (dry run): would run" in entry - for entry in logs.output)) - self.assertTrue( - any("stat_var_processor.py" in entry for entry in logs.output)) - - # Verify run called the command with the same args - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - self.assertTrue( - any("stat_var_processor.py" in arg for arg in args[0])) - self.assertTrue( - any(arg.startswith("--input_data=") for arg in args[0])) - self.assertTrue(kwargs["verbose"]) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="stat_var_processor.py", + cmd_contains="stat_var_processor.py", + 
extra_cmd_checks=[ + lambda command: self.assertTrue( + any(arg.startswith("--input_data=") for arg in command)), + ], + ) def test_process_full_data_step_dry_run_succeeds_if_input_missing( self) -> None: @@ -1074,27 +1024,27 @@ def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: final_output_dir = Path(self._tmpdir) / "output" final_output_dir.mkdir(parents=True, exist_ok=True) (final_output_dir / "demo.csv").write_text("data") - - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" - ) as mock_run_cmd: - step.run() - mock_run_cmd.assert_called_once() - args, kwargs = mock_run_cmd.call_args - command = args[0] - self.assertTrue( - any("generate_custom_dc_config.py" in arg for arg in command)) - self.assertIn(f"--input_csv={final_output_dir}/demo.csv", command) - self.assertIn( - f"--output_config={final_output_dir}/demo_config.json", command) - self.assertIn("--provenance_name=FLOW", command) - self.assertIn("--source_name=AGENCY", command) - self.assertIn("--data_source_url=https://example.com", command) - self.assertIn("--dataset_url=https://example.com/data/AGENCY,FLOW,", - command) - - with self.assertLogs(logging.get_absl_logger(), level="INFO") as cm: - step.dry_run() - self.assertTrue(any("would run" in msg for msg in cm.output)) + self._assert_run_and_dry_run_use_same_plan( + step, + log_contains="generate_custom_dc_config.py", + cmd_contains="generate_custom_dc_config.py", + extra_cmd_checks=[ + lambda command: self.assertIn( + f"--input_csv={final_output_dir}/demo.csv", command), + lambda command: self.assertIn( + f"--output_config={final_output_dir}/demo_config.json", + command), + lambda command: self.assertIn("--provenance_name=FLOW", + command), + lambda command: self.assertIn("--source_name=AGENCY", command), + lambda command: self.assertIn( + "--data_source_url=https://example.com", command), + lambda command: self.assertIn( + "--dataset_url=https://example.com/data/AGENCY,FLOW,", + command), + ], + expect_verbose=False, + ) if __name__ == "__main__": From b98bbac8c60ebda99c124407160d1123ffc34350 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:09:48 +0000 Subject: [PATCH 42/54] Refactor cache plan tests --- .../sdmx_import_pipeline_test.py | 126 +++++++++--------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 8501db8000..5b070886b2 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -681,6 +681,26 @@ def _assert_run_and_dry_run_use_same_plan(self, for check in extra_cmd_checks: check(command) + def _assert_step_caches_plan(self, + step, + *, + command_contains=None, + path_attrs=None) -> None: + command_contains = command_contains or [] + path_attrs = path_attrs or [] + + context1 = step._prepare_command() + context2 = step._prepare_command() + self.assertIs(context1, context2) + + for attr in path_attrs: + self.assertTrue(getattr(context1, attr).is_absolute()) + + if command_contains: + for expected in command_contains: + self.assertTrue( + any(expected in arg for arg in context1.full_command)) + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -707,16 +727,11 @@ def test_download_metadata_step_caches_plan(self) -> None: ), ) step = DownloadMetadataStep(name="test-step", config=config) - - # First call creates 
context - context1 = step._prepare_command() - self.assertIn("download-metadata", context1.full_command) - self.assertIn("--endpoint=https://example.com", context1.full_command) - - # Second call returns same object - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.output_path.is_absolute()) + self._assert_step_caches_plan( + step, + command_contains=["download-metadata", "--endpoint=https://example.com"], + path_attrs=["output_path"], + ) def test_download_metadata_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -758,18 +773,16 @@ def test_download_data_step_caches_plan(self) -> None: ), ) step = DownloadDataStep(name="test-step", config=config) - - # First call creates context - context1 = step._prepare_command() - self.assertIn("download-data", context1.full_command) - self.assertIn("--endpoint=https://example.com", context1.full_command) - self.assertIn("--key=test-key", context1.full_command) - self.assertIn("--param=area=US", context1.full_command) - - # Second call returns same object - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.output_path.is_absolute()) + self._assert_step_caches_plan( + step, + command_contains=[ + "download-data", + "--endpoint=https://example.com", + "--key=test-key", + "--param=area=US", + ], + path_attrs=["output_path"], + ) def test_download_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -803,17 +816,11 @@ def test_create_sample_step_caches_plan(self) -> None: sample=SampleConfig(rows=500), ) step = CreateSampleStep(name="test-step", config=config) - - # No input file created, dry run should still succeed - context1 = step._prepare_command() - self.assertTrue( - any("data_sampler.py" in arg for arg in context1.full_command)) - self.assertIn("--sampler_output_rows=500", context1.full_command) - - # Second call returns same object - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.output_path.is_absolute()) + self._assert_step_caches_plan( + step, + command_contains=["data_sampler.py", "--sampler_output_rows=500"], + path_attrs=["output_path"], + ) def test_create_sample_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig( @@ -876,22 +883,15 @@ def test_create_schema_map_step_caches_plan(self) -> None: skip_confirmation=True, ),) step = CreateSchemaMapStep(name="test-step", config=config) - - # No input files created, dry run should still succeed - - # First call creates context - context1 = step._prepare_command() - self.assertTrue( - any("pvmap_generator.py" in arg for arg in context1.full_command)) - self.assertIn("--gemini_cli=custom-gemini", context1.full_command) - self.assertIn("--skip_confirmation", context1.full_command) - - # Second call returns same object - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.sample_path.is_absolute()) - self.assertTrue(context1.metadata_path.is_absolute()) - self.assertTrue(context1.output_prefix.is_absolute()) + self._assert_step_caches_plan( + step, + command_contains=[ + "pvmap_generator.py", + "--gemini_cli=custom-gemini", + "--skip_confirmation", + ], + path_attrs=["sample_path", "metadata_path", "output_prefix"], + ) def test_create_schema_map_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -943,16 +943,15 @@ def test_process_full_data_step_caches_plan(self) -> None: verbose=True, 
),) step = ProcessFullDataStep(name="test-step", config=config) - - # No input files created, dry run should still succeed - - context1 = step._prepare_command() - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.input_data_path.is_absolute()) - self.assertTrue(context1.pv_map_path.is_absolute()) - self.assertTrue(context1.metadata_path.is_absolute()) - self.assertTrue(context1.output_prefix.is_absolute()) + self._assert_step_caches_plan( + step, + path_attrs=[ + "input_data_path", + "pv_map_path", + "metadata_path", + "output_prefix", + ], + ) def test_process_full_data_step_run_and_dry_run_use_same_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -1005,11 +1004,10 @@ def test_create_dc_config_step_caches_plan(self) -> None: agency="AGENCY", dataflow="FLOW") step = CreateDcConfigStep(name="test-step", config=config) - context1 = step._prepare_command() - context2 = step._prepare_command() - self.assertIs(context1, context2) - self.assertTrue(context1.input_csv.is_absolute()) - self.assertTrue(context1.output_config.is_absolute()) + self._assert_step_caches_plan( + step, + path_attrs=["input_csv", "output_config"], + ) def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: config = self._build_config(dataset_prefix="demo", From c178bfa077688cd885e40cafb765b255490838bf Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:12:09 +0000 Subject: [PATCH 43/54] Refactor missing-input test helpers --- .../sdmx_import_pipeline_test.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 5b070886b2..4a20fbb37b 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -701,6 +701,13 @@ def _assert_step_caches_plan(self, self.assertTrue( any(expected in arg for arg in context1.full_command)) + def _assert_dry_run_succeeds_without_input(self, step) -> None: + step.dry_run() + + def _assert_run_fails_without_input(self, step, error_pattern: str) -> None: + with self.assertRaisesRegex(RuntimeError, error_pattern): + step.run() + def test_run_command_logs_and_executes(self) -> None: with mock.patch("subprocess.run") as mock_run: with self.assertLogs(logging.get_absl_logger(), @@ -855,7 +862,7 @@ def test_create_sample_step_dry_run_succeeds_if_input_missing(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) # No input file created, dry run should still succeed - step.dry_run() + self._assert_dry_run_succeeds_without_input(step) def test_create_sample_step_run_fails_if_input_missing(self) -> None: config = PipelineConfig( @@ -869,9 +876,8 @@ def test_create_sample_step_run_fails_if_input_missing(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) # No input file created, run should fail - with self.assertRaisesRegex(RuntimeError, - "Input file missing for sampling"): - step.run() + self._assert_run_fails_without_input( + step, "Input file missing for sampling") def test_create_schema_map_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -921,7 +927,7 @@ def test_create_schema_map_step_dry_run_succeeds_if_input_missing( ),) step = CreateSchemaMapStep(name="test-step", config=config) # No input files created, dry run should still succeed - step.dry_run() + self._assert_dry_run_succeeds_without_input(step) def 
test_create_schema_map_step_run_fails_if_input_missing(self) -> None: config = PipelineConfig(run=RunConfig( @@ -932,8 +938,7 @@ def test_create_schema_map_step_run_fails_if_input_missing(self) -> None: ),) step = CreateSchemaMapStep(name="test-step", config=config) # No input files created, run should fail - with self.assertRaises(RuntimeError): - step.run() + self._assert_run_fails_without_input(step, ".*") def test_process_full_data_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -984,7 +989,7 @@ def test_process_full_data_step_dry_run_succeeds_if_input_missing( ),) step = ProcessFullDataStep(name="test-step", config=config) # Missing input files, dry run should still succeed - step.dry_run() + self._assert_dry_run_succeeds_without_input(step) def test_process_full_data_step_run_fails_if_input_missing(self) -> None: config = PipelineConfig(run=RunConfig( @@ -995,8 +1000,7 @@ def test_process_full_data_step_run_fails_if_input_missing(self) -> None: ),) step = ProcessFullDataStep(name="test-step", config=config) # Missing input files, run should fail - with self.assertRaises(RuntimeError): - step.run() + self._assert_run_fails_without_input(step, ".*") def test_create_dc_config_step_caches_plan(self) -> None: config = self._build_config(dataset_prefix="demo", From ef361a9f36efc86c7b436408e51a1512b5374842 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:14:10 +0000 Subject: [PATCH 44/54] lint fix --- .../sdmx_import_pipeline_test.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 4a20fbb37b..0340b8b227 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -653,14 +653,14 @@ def test_hash_unchanged_skips_rerun(self) -> None: class SdmxStepTest(SdmxTestBase): - def _assert_run_and_dry_run_use_same_plan(self, - step, - *, - log_contains: str, - cmd_contains: str, - extra_cmd_checks=None, - expect_verbose: bool = True - ) -> None: + def _assert_run_and_dry_run_use_same_plan( + self, + step, + *, + log_contains: str, + cmd_contains: str, + extra_cmd_checks=None, + expect_verbose: bool = True) -> None: extra_cmd_checks = extra_cmd_checks or [] with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" ) as mock_run_cmd: @@ -736,7 +736,9 @@ def test_download_metadata_step_caches_plan(self) -> None: step = DownloadMetadataStep(name="test-step", config=config) self._assert_step_caches_plan( step, - command_contains=["download-metadata", "--endpoint=https://example.com"], + command_contains=[ + "download-metadata", "--endpoint=https://example.com" + ], path_attrs=["output_path"], ) @@ -876,8 +878,8 @@ def test_create_sample_step_run_fails_if_input_missing(self) -> None: ) step = CreateSampleStep(name="test-step", config=config) # No input file created, run should fail - self._assert_run_fails_without_input( - step, "Input file missing for sampling") + self._assert_run_fails_without_input(step, + "Input file missing for sampling") def test_create_schema_map_step_caches_plan(self) -> None: config = PipelineConfig(run=RunConfig( @@ -1036,8 +1038,8 @@ def test_create_dc_config_step_run_and_dry_run_use_same_plan(self) -> None: lambda command: self.assertIn( f"--output_config={final_output_dir}/demo_config.json", command), - lambda command: self.assertIn("--provenance_name=FLOW", - command), + lambda command: 
self.assertIn("--provenance_name=FLOW", command + ), lambda command: self.assertIn("--source_name=AGENCY", command), lambda command: self.assertIn( "--data_source_url=https://example.com", command), From 6a93ffcd16c111f3e9795f14eaae8b95dd741641 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:17:31 +0000 Subject: [PATCH 45/54] Fix absolute output path handling --- tools/agentic_import/run_statvar_processor.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/run_statvar_processor.sh b/tools/agentic_import/run_statvar_processor.sh index d907e4f724..2c2cd83fca 100755 --- a/tools/agentic_import/run_statvar_processor.sh +++ b/tools/agentic_import/run_statvar_processor.sh @@ -67,6 +67,13 @@ if [[ -z "$PYTHON_INTERPRETER" || -z "$SCRIPT_DIR" || -z "$WORKING_DIR" || -z "$ exit 1 fi +# Normalize output prefix: respect absolute paths, otherwise anchor under working dir. +if [[ "${OUTPUT_PATH}" = /* ]]; then + OUTPUT_PREFIX="${OUTPUT_PATH}" +else + OUTPUT_PREFIX="${WORKING_DIR}/${OUTPUT_PATH}" +fi + # Create .datacommons directory if it doesn't exist mkdir -p "${WORKING_DIR}/.datacommons" @@ -81,20 +88,19 @@ OUTPUT_COLUMNS="observationDate,observationAbout,variableMeasured,value,observat echo "Running statvar processor..." "${PYTHON_INTERPRETER}" "${SCRIPT_DIR}/statvar_importer/stat_var_processor.py" \ --input_data="${INPUT_DATA}" \ - --pv_map="${WORKING_DIR}/${OUTPUT_PATH}_pvmap.csv" \ - --config_file="${WORKING_DIR}/${OUTPUT_PATH}_metadata.csv" \ + --pv_map="${OUTPUT_PREFIX}_pvmap.csv" \ + --config_file="${OUTPUT_PREFIX}_metadata.csv" \ --generate_statvar_name=True \ --skip_constant_csv_columns=False \ --output_columns="${OUTPUT_COLUMNS}" \ --output_counters="${WORKING_DIR}/.datacommons/output_counters.csv" \ - --output_path="${WORKING_DIR}/${OUTPUT_PATH}" > "${PROCESSOR_LOG}" 2>&1 + --output_path="${OUTPUT_PREFIX}" > "${PROCESSOR_LOG}" 2>&1 # Capture the processor exit code PROCESSOR_EXIT_CODE=${PIPESTATUS[0]} # Run backup script silently (redirect output to backup log) echo "Backing up run data..." 
-OUTPUT_PREFIX="${WORKING_DIR}/${OUTPUT_PATH}" declare -a BACKUP_ARGS=( "--working_dir=${WORKING_DIR}" "--gemini_run_id=${GEMINI_RUN_ID}" From fc8d555af1b906505cc8c116399d3bc6e3840510 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 07:21:58 +0000 Subject: [PATCH 46/54] Add abs path backup test --- tools/agentic_import/backup_processor_run_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/agentic_import/backup_processor_run_test.py b/tools/agentic_import/backup_processor_run_test.py index 3681b3e7ae..027f1f81f3 100644 --- a/tools/agentic_import/backup_processor_run_test.py +++ b/tools/agentic_import/backup_processor_run_test.py @@ -45,6 +45,17 @@ def _read_manifest(self, backup_dir: Path) -> str: with open(manifest_path, 'r') as manifest_file: return manifest_file.read() + def test_absolute_path_copied(self): + absolute_file = self.working_dir / 'abs.txt' + absolute_file.write_text('absolute') + + backup_dir = self._run_backup([str(absolute_file)]) + + self.assertTrue((backup_dir / 'abs.txt').exists()) + manifest = self._read_manifest(backup_dir) + self.assertIn(str(absolute_file), manifest) + self.assertNotIn('Skipped (missing or blocked):', manifest) + def test_copies_requested_files(self): first = self.working_dir / 'a.txt' second = self.working_dir / 'b.txt' From 7b33667c82fb20b7d16c29117cabd28613ac1d2c Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 08:27:12 +0000 Subject: [PATCH 47/54] minor fix in tests --- tools/agentic_import/state_handler_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/agentic_import/state_handler_test.py b/tools/agentic_import/state_handler_test.py index c000260f6d..dfcdd416dc 100644 --- a/tools/agentic_import/state_handler_test.py +++ b/tools/agentic_import/state_handler_test.py @@ -47,7 +47,7 @@ def test_missing_file_creates_empty_state(self) -> None: data = json.load(fp) self.assertEqual(data["dataset_prefix"], "demo") self.assertEqual(data["steps"], {}) - self.assertIsNone(data["updated_at_ts"]) + self.assertIsNone(data.get("updated_at_ts")) def test_corrupt_file_creates_backup_and_resets_state(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: From cbec5c1e1c9cb50ad55a444324cb9a6a8fb49de8 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 08:33:34 +0000 Subject: [PATCH 48/54] refactor --- tools/agentic_import/sdmx_import_pipeline.py | 86 ++++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 3c933f5d5a..99ff2cf000 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -64,6 +64,49 @@ FLAGS = flags.FLAGS +def _define_flags() -> None: + flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") + flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) + + flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, + "Owning SDMX agency identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) + + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, + "Target SDMX dataflow identifier.") + flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) + + flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, + "Optional SDMX key or filter.") + + flags.DEFINE_string( + _FLAG_SDMX_DATAFLOW_PARAM, None, + "Optional SDMX parameter appended to the dataflow query.") + + flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, + "Number of rows to sample from downloaded data.") + + 
flags.DEFINE_string( + "dataset_prefix", None, + "Optional dataset prefix to override auto-derived values.") + + flags.DEFINE_string("run_only", None, + "Execute only a specific pipeline step by name.") + + flags.DEFINE_boolean("force", False, "Force all steps to run.") + + flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") + + flags.DEFINE_boolean("skip_confirmation", False, + "Skip interactive confirmation prompts.") + + flags.DEFINE_string("gemini_cli", "gemini", + "Path to Gemini CLI executable.") + + flags.DEFINE_string("working_dir", None, + "Working directory for the pipeline.") + + @dataclass(frozen=True) class SdmxDataflowConfig: """Configuration for SDMX dataflow.""" @@ -949,49 +992,6 @@ def prepare_config() -> PipelineConfig: ) -def _define_flags() -> None: - flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") - flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) - - flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, - "Owning SDMX agency identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) - - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, - "Target SDMX dataflow identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) - - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, - "Optional SDMX key or filter.") - - flags.DEFINE_string( - _FLAG_SDMX_DATAFLOW_PARAM, None, - "Optional SDMX parameter appended to the dataflow query.") - - flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, - "Number of rows to sample from downloaded data.") - - flags.DEFINE_string( - "dataset_prefix", None, - "Optional dataset prefix to override auto-derived values.") - - flags.DEFINE_string("run_only", None, - "Execute only a specific pipeline step by name.") - - flags.DEFINE_boolean("force", False, "Force all steps to run.") - - flags.DEFINE_boolean("verbose", False, "Enable verbose logging.") - - flags.DEFINE_boolean("skip_confirmation", False, - "Skip interactive confirmation prompts.") - - flags.DEFINE_string("gemini_cli", "gemini", - "Path to Gemini CLI executable.") - - flags.DEFINE_string("working_dir", None, - "Working directory for the pipeline.") - - def main(_: list[str]) -> int: config = prepare_config() logging.info(f"SDMX pipeline configuration: {config}") From 1f74094babce263c52085a1aaaa0c2deda10da61 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 08:48:01 +0000 Subject: [PATCH 49/54] refactor: Remove `get_steps` method and directly access `pipeline.steps` attribute. 
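
Call sites switch from the removed accessor to reading the frozen dataclass
field directly. A minimal before/after sketch (it mirrors the test updates in
this patch); note that `pipeline.steps` now exposes the stored Sequence rather
than the fresh list that `get_steps()` used to return:

    # Before: accessor returned a new list copy of the steps.
    names = [step.name for step in pipeline.get_steps()]

    # After: read the Sequence stored on the frozen Pipeline dataclass.
    names = [step.name for step in pipeline.steps]
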
--- tools/agentic_import/pipeline.py | 5 +---- tools/agentic_import/sdmx_import_pipeline_test.py | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index 09b22a94dd..43d009c3b6 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -67,9 +67,6 @@ def version(self) -> int: class Pipeline: steps: Sequence[Step] - def get_steps(self) -> list[Step]: - return list(self.steps) - class PipelineCallback: """Lifecycle hooks consumed by the runner; defaults are no-ops.""" @@ -112,7 +109,7 @@ def run(self, pipeline: Pipeline, callback: PipelineCallback | None = None) -> None: current_step: Step | None = None - steps = pipeline.get_steps() + steps = pipeline.steps logging.info(f"Starting pipeline with {len(steps)} steps") try: for step in steps: diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 0340b8b227..6f971faecb 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -341,7 +341,7 @@ def _names_from_builder(self, steps=builder_steps) result = builder.build() pipeline = result.pipeline - return [step.name for step in pipeline.get_steps()] + return [step.name for step in pipeline.steps] def test_run_only_step(self) -> None: cfg_step = PipelineConfig( @@ -432,7 +432,7 @@ def test_version_bump_schedules_downstream(self) -> None: self.assertEqual(names, ["process-full-data", "create-dc-config"]) pipeline = build_sdmx_pipeline(config=cfg, state=state, steps=steps) - self.assertEqual([s.name for s in pipeline.get_steps()], + self.assertEqual([s.name for s in pipeline.steps], ["process-full-data", "create-dc-config"]) def test_incremental_records_skip_reasons(self) -> None: @@ -448,7 +448,7 @@ def test_incremental_records_skip_reasons(self) -> None: steps = build_steps(cfg) builder = PipelineBuilder(config=cfg, state=state, steps=steps) result = builder.build() - self.assertFalse(result.pipeline.get_steps()) + self.assertFalse(result.pipeline.steps) self.assertEqual(len(result.decisions), len(steps)) for decision in result.decisions: self.assertEqual(decision.decision, StepDecision.SKIP) From 31c00f86a9f223f2b9a38aeecb4757e587c43bb3 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Thu, 27 Nov 2025 09:00:07 +0000 Subject: [PATCH 50/54] feat: Introduce structured SDMX agentic import pipeline with dedicated configuration and step implementations. 
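
The former monolith is split into sdmx_pipeline_config.py (configuration
dataclasses), sdmx_pipeline_steps.py (step implementations) and
sdmx_pipeline_builder.py (planning), while sdmx_import_pipeline.py keeps the
CLI flags and state wiring. A rough sketch of how the pieces compose; the
literal endpoint/agency/dataflow/prefix values and the working directory are
placeholders, and the PipelineState/StateHandler wiring is omitted:

    from tools.agentic_import.sdmx_pipeline_builder import build_steps
    from tools.agentic_import.sdmx_pipeline_config import (PipelineConfig,
                                                           RunConfig, SdmxConfig,
                                                           SdmxDataflowConfig)

    config = PipelineConfig(
        sdmx=SdmxConfig(endpoint="https://example.com",
                        agency="AGENCY",
                        dataflow=SdmxDataflowConfig(id="FLOW")),
        run=RunConfig(command="python",
                      dataset_prefix="demo",
                      working_dir="/tmp/demo"))

    # build_steps returns the canonical six-step list; dry_run only logs the
    # command each step would execute and touches no files.
    for step in build_steps(config):
        step.dry_run()

build_sdmx_pipeline(config=..., state=..., steps=...) then applies the usual
planning (run_only, force, critical-input hash and incremental checks) on top
of these steps before handing the resulting Pipeline to the runner.
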
--- tools/agentic_import/sdmx_import_pipeline.py | 696 +----------------- .../sdmx_import_pipeline_test.py | 18 +- tools/agentic_import/sdmx_pipeline_builder.py | 231 ++++++ tools/agentic_import/sdmx_pipeline_config.py | 61 ++ tools/agentic_import/sdmx_pipeline_steps.py | 467 ++++++++++++ 5 files changed, 781 insertions(+), 692 deletions(-) create mode 100644 tools/agentic_import/sdmx_pipeline_builder.py create mode 100644 tools/agentic_import/sdmx_pipeline_config.py create mode 100644 tools/agentic_import/sdmx_pipeline_steps.py diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index 99ff2cf000..a26e6799f0 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -15,20 +15,17 @@ from __future__ import annotations -import abc import copy import hashlib import json import os import re import shlex -import subprocess import sys import dataclasses -from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path -from typing import Callable, ClassVar, Sequence +from typing import Callable, Sequence from absl import app, flags, logging @@ -36,22 +33,16 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from tools.agentic_import.pipeline import (CompositeCallback, Pipeline, - PipelineAbort, PipelineCallback, - PipelineRunner, RunnerConfig, Step) -from tools.agentic_import.state_handler import (PipelineState, StateHandler, - StepState) - -SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" -DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" -STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / - "stat_var_processor.py") -PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" -DC_CONFIG_GENERATOR_PATH = (REPO_ROOT / "tools" / "agentic_import" / - "generate_custom_dc_config.py") - -SAMPLE_OUTPUT_DIR = Path("sample_output") -FINAL_OUTPUT_DIR = Path("output") +from tools.agentic_import.pipeline import (CompositeCallback, PipelineAbort, + PipelineCallback, PipelineRunner, + RunnerConfig, Step) +from tools.agentic_import.sdmx_pipeline_builder import build_sdmx_pipeline +from tools.agentic_import.sdmx_pipeline_config import (PipelineConfig, + RunConfig, SampleConfig, + SdmxConfig, + SdmxDataflowConfig) +from tools.agentic_import.sdmx_pipeline_steps import SdmxStep +from tools.agentic_import.state_handler import StateHandler, StepState # Flag names _FLAG_SDMX_ENDPOINT = "sdmx.endpoint" @@ -107,86 +98,6 @@ def _define_flags() -> None: "Working directory for the pipeline.") -@dataclass(frozen=True) -class SdmxDataflowConfig: - """Configuration for SDMX dataflow.""" - id: str | None = None - key: str | None = None - param: str | None = None - - -@dataclass(frozen=True) -class SdmxConfig: - """Configuration for SDMX data access.""" - endpoint: str | None = None - agency: str | None = None - dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) - - -@dataclass(frozen=True) -class SampleConfig: - """Configuration for data sampling.""" - rows: int = 1000 - - -@dataclass(frozen=True) -class RunConfig: - """Configuration for pipeline execution.""" - command: str - dataset_prefix: str | None = None - working_dir: str | None = None - run_only: str | None = None - force: bool = False - verbose: bool = False - skip_confirmation: bool = False - gemini_cli: str | None = None - - -@dataclass(frozen=True) -class PipelineConfig: - 
"""Aggregated configuration for the pipeline.""" - sdmx: SdmxConfig = field(default_factory=SdmxConfig) - sample: SampleConfig = field(default_factory=SampleConfig) - run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) - - -@dataclass(frozen=True) -class StepDecision: - """Represents whether a step will run and why.""" - - RUN: ClassVar[str] = "RUN" - SKIP: ClassVar[str] = "SKIP" - - step_name: str - decision: str - reason: str - - -@dataclass(frozen=True) -class BuildResult: - """Output of planning that includes the pipeline and per-step decisions.""" - - pipeline: Pipeline - decisions: list[StepDecision] - - -def _require_config_field(value: str | None, field: str, step_name: str) -> str: - if value: - return value - raise ValueError(f"{step_name} requires config.{field}") - - -def _run_command(command: Sequence[str], *, verbose: bool) -> None: - if verbose: - logging.debug(f"Running command: {' '.join(command)}") - subprocess.run(command, check=True) - - -def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: - command = [sys.executable, str(SDMX_CLI_PATH), *args] - _run_command(command, verbose=verbose) - - def _format_time(value: datetime) -> str: if value.tzinfo is None: value = value.replace(tzinfo=timezone.utc) @@ -343,591 +254,6 @@ def build_pipeline_callback( return CompositeCallback([interactive, json_callback]) -class SdmxStep(Step): - """Base class for SDMX steps that carries immutable config and version.""" - - def __init__(self, *, name: str, version: int, - config: PipelineConfig) -> None: - if not name: - raise ValueError("step requires a name") - self._name = name - self._version = version - self._config = config - - @property - def name(self) -> str: - return self._name - - @property - def version(self) -> int: - return self._version - - @abc.abstractmethod - def dry_run(self) -> None: - """Log a read-only preview of the work to be done.""" - - -class DownloadDataStep(SdmxStep): - """Downloads SDMX data payloads.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - full_command: list[str] - output_path: Path - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: DownloadDataStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) - agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) - dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - output_path = working_dir / f"{dataset_prefix}_data.csv" - args = [ - "download-data", - f"--endpoint={endpoint}", - f"--agency={agency}", - f"--dataflow={dataflow}", - f"--output_path={output_path}", - ] - if self._config.sdmx.dataflow.key: - args.append(f"--key={self._config.sdmx.dataflow.key}") - if self._config.sdmx.dataflow.param: - args.append(f"--param={self._config.sdmx.dataflow.param}") - if self._config.run.verbose: - args.append("--verbose") - full_command = [sys.executable, str(SDMX_CLI_PATH)] + args - self._context = DownloadDataStep._StepContext(full_command=full_command, - output_path=output_path) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if self._config.run.verbose: - 
logging.info( - f"Starting SDMX data download: {' '.join(context.full_command)} -> {context.output_path}" - ) - else: - logging.info(f"Downloading SDMX data to {context.output_path}") - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class DownloadMetadataStep(SdmxStep): - """Downloads SDMX metadata payloads.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - full_command: list[str] - output_path: Path - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: DownloadMetadataStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) - agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) - dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - output_path = working_dir / f"{dataset_prefix}_metadata.xml" - args = [ - "download-metadata", - f"--endpoint={endpoint}", - f"--agency={agency}", - f"--dataflow={dataflow}", - f"--output_path={output_path}", - ] - if self._config.run.verbose: - args.append("--verbose") - full_command = [sys.executable, str(SDMX_CLI_PATH)] + args - self._context = DownloadMetadataStep._StepContext( - full_command=full_command, output_path=output_path) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if self._config.run.verbose: - logging.info( - f"Starting SDMX metadata download: {' '.join(context.full_command)} -> {context.output_path}" - ) - else: - logging.info(f"Downloading SDMX metadata to {context.output_path}") - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class CreateSampleStep(SdmxStep): - """Creates a sample dataset from downloaded data.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - input_path: Path - full_command: list[str] - output_path: Path - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: CreateSampleStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - input_path = working_dir / f"{dataset_prefix}_data.csv" - output_path = working_dir / f"{dataset_prefix}_sample.csv" - - args = [ - f"--sampler_input={input_path}", - f"--sampler_output={output_path}", - f"--sampler_output_rows={self._config.sample.rows}", - ] - full_command = [sys.executable, str(DATA_SAMPLER_PATH)] + args - self._context = CreateSampleStep._StepContext(input_path=input_path, - full_command=full_command, - output_path=output_path) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if not context.input_path.is_file(): - raise RuntimeError( - f"Input file missing for sampling: 
{context.input_path}") - if self._config.run.verbose: - logging.info( - f"Starting data sampling: {' '.join(context.full_command)} -> {context.output_path}" - ) - else: - logging.info(f"Sampling data to {context.output_path}") - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class CreateSchemaMapStep(SdmxStep): - """Builds schema mappings for transformed data.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - sample_path: Path - metadata_path: Path - output_prefix: Path - full_command: list[str] - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: CreateSchemaMapStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - sample_path = working_dir / f"{dataset_prefix}_sample.csv" - metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" - output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix - - args = [ - f"--input_data={sample_path}", - f"--input_metadata={metadata_path}", - "--sdmx_dataset", - f"--output_path={output_prefix}", - ] - if self._config.run.skip_confirmation: - args.append("--skip_confirmation") - if self._config.run.gemini_cli: - args.append(f"--gemini_cli={self._config.run.gemini_cli}") - args.append(f"--working_dir={working_dir}") - - full_command = [sys.executable, str(PVMAP_GENERATOR_PATH)] + args - self._context = CreateSchemaMapStep._StepContext( - sample_path=sample_path, - metadata_path=metadata_path, - output_prefix=output_prefix, - full_command=full_command) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if not context.sample_path.is_file(): - raise RuntimeError(f"Sample file missing: {context.sample_path}") - if not context.metadata_path.is_file(): - raise RuntimeError( - f"Metadata file missing: {context.metadata_path}") - context.output_prefix.parent.mkdir(parents=True, exist_ok=True) - logging.info( - f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" - ) - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class ProcessFullDataStep(SdmxStep): - """Processes full SDMX data into DC artifacts.""" - - VERSION = 1 - - RUN_OUTPUT_COLUMNS: ClassVar[str] = ( - "observationDate,observationAbout,variableMeasured,value," - "observationPeriod,measurementMethod,unit,scalingFactor") - - @dataclass(frozen=True) - class _StepContext: - input_data_path: Path - pv_map_path: Path - metadata_path: Path - full_command: list[str] - output_prefix: Path - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: ProcessFullDataStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - input_data_path = working_dir / f"{dataset_prefix}_data.csv" - pv_map_path = 
(working_dir / SAMPLE_OUTPUT_DIR / - f"{dataset_prefix}_pvmap.csv") - metadata_path = (working_dir / SAMPLE_OUTPUT_DIR / - f"{dataset_prefix}_metadata.csv") - output_prefix = working_dir / FINAL_OUTPUT_DIR / dataset_prefix - - args = [ - f"--input_data={input_data_path}", - f"--pv_map={pv_map_path}", - f"--config_file={metadata_path}", - "--generate_statvar_name=True", - "--skip_constant_csv_columns=False", - f"--output_columns={self.RUN_OUTPUT_COLUMNS}", - f"--output_path={output_prefix}", - ] - full_command = [sys.executable, str(STAT_VAR_PROCESSOR_PATH)] + args - self._context = ProcessFullDataStep._StepContext( - input_data_path=input_data_path, - pv_map_path=pv_map_path, - metadata_path=metadata_path, - full_command=full_command, - output_prefix=output_prefix, - ) - return self._context - - def run(self) -> None: - context = self._prepare_command() - for required in (context.input_data_path, context.pv_map_path, - context.metadata_path): - if not required.is_file(): - raise RuntimeError( - f"{self.name} requires existing input: {required}") - # Ensure output directory exists - context.output_prefix.parent.mkdir(parents=True, exist_ok=True) - logging.info( - f"Starting stat_var_processor: input={context.input_data_path} " - f"pvmap={context.pv_map_path} metadata={context.metadata_path} -> " - f"{context.output_prefix}") - _run_command(context.full_command, verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class CreateDcConfigStep(SdmxStep): - """Generates Datacommons configuration artifacts.""" - - VERSION = 1 - - @dataclass(frozen=True) - class _StepContext: - input_csv: Path - output_config: Path - full_command: list[str] - - def __init__(self, *, name: str, config: PipelineConfig) -> None: - super().__init__(name=name, version=self.VERSION, config=config) - self._context: CreateDcConfigStep._StepContext | None = None - - def _prepare_command(self) -> _StepContext: - if self._context: - return self._context - dataset_prefix = self._config.run.dataset_prefix - working_dir = Path(self._config.run.working_dir).resolve() - input_csv = working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}.csv" - output_config = (working_dir / FINAL_OUTPUT_DIR / - f"{dataset_prefix}_config.json") - - endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) - agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) - dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) - - dataset_url = (f"{endpoint.rstrip('/')}/data/" - f"{agency},{dataflow},") - - args = [ - f"--input_csv={input_csv}", - f"--output_config={output_config}", - f"--provenance_name={dataflow}", - f"--source_name={agency}", - f"--data_source_url={endpoint}", - f"--dataset_url={dataset_url}", - ] - full_command = [sys.executable, str(DC_CONFIG_GENERATOR_PATH)] + args - self._context = CreateDcConfigStep._StepContext( - input_csv=input_csv, - output_config=output_config, - full_command=full_command) - return self._context - - def run(self) -> None: - context = self._prepare_command() - if not context.input_csv.is_file(): - raise RuntimeError( - f"{self.name} requires existing input: {context.input_csv}") - - logging.info( - f"Starting custom DC config generation: input={context.input_csv} -> {context.output_config}" - ) - _run_command(context.full_command, 
verbose=self._config.run.verbose) - - def dry_run(self) -> None: - context = self._prepare_command() - logging.info( - f"{self.name} (dry run): would run {' '.join(context.full_command)}" - ) - - -class PipelineBuilder: - - def __init__(self, - *, - config: PipelineConfig, - state: PipelineState, - steps: Sequence[Step], - critical_input_hash: str | None = None) -> None: - self._config = config - self._state = state - self._steps = steps - self._critical_input_hash = critical_input_hash - - def build(self) -> BuildResult: - if self._config.run.run_only: - planned, decisions = self._plan_run_only(self._config.run.run_only) - elif self._config.run.force: - logging.info("Force flag set; scheduling all SDMX steps") - planned, decisions = self._plan_all_steps( - "Force flag set; scheduling this step") - elif self._hash_changed(): - logging.info("Critical inputs changed; scheduling all SDMX steps") - planned, decisions = self._plan_all_steps( - "Critical inputs changed; scheduling this step") - else: - planned, decisions = self._plan_incremental() - logging.info("Built SDMX pipeline with %d steps", len(planned)) - return BuildResult(pipeline=Pipeline(steps=planned), - decisions=decisions) - - def _plan_run_only(self, - run_only: str) -> tuple[list[Step], list[StepDecision]]: - planned: list[Step] = [] - decisions: list[StepDecision] = [] - for step in self._steps: - if step.name == run_only: - planned.append(step) - decisions.append( - StepDecision( - step_name=step.name, - decision=StepDecision.RUN, - reason=(f"run_only={run_only} requested; running only " - "this step"), - )) - else: - decisions.append( - StepDecision( - step_name=step.name, - decision=StepDecision.SKIP, - reason=(f"run_only={run_only} requested; skipping " - "this step"), - )) - if not planned: - raise ValueError(f"run_only step not found: {run_only}") - return planned, decisions - - def _plan_all_steps(self, - reason: str) -> tuple[list[Step], list[StepDecision]]: - planned: list[Step] = [] - decisions: list[StepDecision] = [] - for step in self._steps: - planned.append(step) - decisions.append( - StepDecision(step_name=step.name, - decision=StepDecision.RUN, - reason=reason)) - return planned, decisions - - def _plan_incremental(self) -> tuple[list[Step], list[StepDecision]]: - planned: list[Step] = [] - decisions: list[StepDecision] = [] - schedule_all_remaining = False - previous: Step | None = None - for step in self._steps: - if schedule_all_remaining: - planned.append(step) - decisions.append( - StepDecision( - step_name=step.name, - decision=StepDecision.RUN, - reason=("Upstream step triggered rerun for remaining " - "steps"), - )) - previous = step - continue - - prev_state = self._state.steps.get(step.name) - if prev_state is None: - needs_run = True - reason = "No previous state recorded; scheduling step" - elif prev_state.status != "succeeded": - needs_run = True - reason = (f"Previous run status was {prev_state.status}; " - "rerunning step") - elif prev_state.version < step.version: - needs_run = True - reason = ( - f"Step version increased from {prev_state.version} to " - f"{step.version}; rerunning step") - else: - needs_run = False - reason = ("Previous run succeeded with same version; step is " - "up-to-date") - - if not needs_run and previous is not None: - if self._predecessor_newer(previous, step): - needs_run = True - reason = (f"Previous step {previous.name} finished more " - "recently; rerunning downstream steps") - - if needs_run: - planned.append(step) - decisions.append( - 
StepDecision(step_name=step.name, - decision=StepDecision.RUN, - reason=reason)) - schedule_all_remaining = True - else: - decisions.append( - StepDecision(step_name=step.name, - decision=StepDecision.SKIP, - reason=reason)) - previous = step - - if not planned: - logging.info("No steps scheduled.") - return planned, decisions - - def _hash_changed(self) -> bool: - if not self._critical_input_hash: - return False - previous = self._state.critical_input_hash - if not previous: - return True - return previous != self._critical_input_hash - - def _predecessor_newer(self, prev_step: Step, step: Step) -> bool: - prev_state = self._state.steps.get(prev_step.name) - curr_state = self._state.steps.get(step.name) - if prev_state is None or prev_state.ended_at_ts is None: - return False - if curr_state is None: - return True - if curr_state.status != "succeeded": - return True - if curr_state.ended_at_ts is None: - return True - return prev_state.ended_at_ts > curr_state.ended_at_ts - - -def build_steps(config: PipelineConfig) -> list[Step]: - """Constructs the hard-coded list of canonical steps.""" - return [ - DownloadDataStep(name="download-data", config=config), - DownloadMetadataStep(name="download-metadata", config=config), - CreateSampleStep(name="create-sample", config=config), - CreateSchemaMapStep(name="create-schema-mapping", config=config), - ProcessFullDataStep(name="process-full-data", config=config), - CreateDcConfigStep(name="create-dc-config", config=config), - ] - - -def _log_step_decisions(decisions: Sequence[StepDecision]) -> None: - for decision in decisions: - logging.info("step=%s decision=%s reason=%s", decision.step_name, - decision.decision, decision.reason) - - -def build_sdmx_pipeline(*, - config: PipelineConfig, - state: PipelineState, - steps: Sequence[Step] | None = None, - critical_input_hash: str | None = None) -> Pipeline: - builder_steps = steps if steps is not None else build_steps(config) - builder = PipelineBuilder(config=config, - state=state, - steps=builder_steps, - critical_input_hash=critical_input_hash) - result = builder.build() - _log_step_decisions(result.decisions) - return result.pipeline - - def run_sdmx_pipeline( *, config: PipelineConfig, diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 6f971faecb..88b86b5721 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -44,11 +44,15 @@ RunnerConfig, ) from tools.agentic_import.sdmx_import_pipeline import ( # pylint: disable=import-error - InteractiveCallback, JSONStateCallback, PipelineBuilder, PipelineConfig, - StepDecision, build_pipeline_callback, build_sdmx_pipeline, build_steps, - run_sdmx_pipeline, DownloadMetadataStep, DownloadDataStep, CreateSampleStep, - CreateSchemaMapStep, ProcessFullDataStep, CreateDcConfigStep, _run_command, - SdmxConfig, SampleConfig, RunConfig, SdmxDataflowConfig, SdmxStep) + InteractiveCallback, JSONStateCallback, build_pipeline_callback, + run_sdmx_pipeline) +from tools.agentic_import.sdmx_pipeline_builder import ( # pylint: disable=import-error + PipelineBuilder, StepDecision, build_sdmx_pipeline, build_steps) +from tools.agentic_import.sdmx_pipeline_config import ( # pylint: disable=import-error + PipelineConfig, RunConfig, SampleConfig, SdmxConfig, SdmxDataflowConfig) +from tools.agentic_import.sdmx_pipeline_steps import ( # pylint: disable=import-error + CreateDcConfigStep, CreateSampleStep, CreateSchemaMapStep, DownloadDataStep, 
+ DownloadMetadataStep, ProcessFullDataStep, SdmxStep, _run_command) from tools.agentic_import.state_handler import ( # pylint: disable=import-error PipelineState, StateHandler, StepState) @@ -507,7 +511,7 @@ def setUp(self) -> None: super().setUp() # Mock _run_command to avoid actual execution during pipeline tests self._run_command_patcher = mock.patch( - "tools.agentic_import.sdmx_import_pipeline._run_command") + "tools.agentic_import.sdmx_pipeline_steps._run_command") self._mock_run_command = self._run_command_patcher.start() self.addCleanup(self._run_command_patcher.stop) @@ -662,7 +666,7 @@ def _assert_run_and_dry_run_use_same_plan( extra_cmd_checks=None, expect_verbose: bool = True) -> None: extra_cmd_checks = extra_cmd_checks or [] - with mock.patch("tools.agentic_import.sdmx_import_pipeline._run_command" + with mock.patch("tools.agentic_import.sdmx_pipeline_steps._run_command" ) as mock_run_cmd: with self.assertLogs(logging.get_absl_logger(), level="INFO") as logs: diff --git a/tools/agentic_import/sdmx_pipeline_builder.py b/tools/agentic_import/sdmx_pipeline_builder.py new file mode 100644 index 0000000000..8d8c01ee64 --- /dev/null +++ b/tools/agentic_import/sdmx_pipeline_builder.py @@ -0,0 +1,231 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Builder for the SDMX agentic import pipeline.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import ClassVar, Sequence + +from absl import logging + +from tools.agentic_import.pipeline import Pipeline, Step +from tools.agentic_import.sdmx_pipeline_config import PipelineConfig +from tools.agentic_import.sdmx_pipeline_steps import ( + CreateDcConfigStep, CreateSampleStep, CreateSchemaMapStep, DownloadDataStep, + DownloadMetadataStep, ProcessFullDataStep) +from tools.agentic_import.state_handler import PipelineState + + +@dataclass(frozen=True) +class StepDecision: + """Represents whether a step will run and why.""" + + RUN: ClassVar[str] = "RUN" + SKIP: ClassVar[str] = "SKIP" + + step_name: str + decision: str + reason: str + + +@dataclass(frozen=True) +class BuildResult: + """Output of planning that includes the pipeline and per-step decisions.""" + + pipeline: Pipeline + decisions: list[StepDecision] + + +class PipelineBuilder: + + def __init__(self, + *, + config: PipelineConfig, + state: PipelineState, + steps: Sequence[Step], + critical_input_hash: str | None = None) -> None: + self._config = config + self._state = state + self._steps = steps + self._critical_input_hash = critical_input_hash + + def build(self) -> BuildResult: + if self._config.run.run_only: + planned, decisions = self._plan_run_only(self._config.run.run_only) + elif self._config.run.force: + logging.info("Force flag set; scheduling all SDMX steps") + planned, decisions = self._plan_all_steps( + "Force flag set; scheduling this step") + elif self._hash_changed(): + logging.info("Critical inputs changed; scheduling all SDMX steps") + planned, decisions = self._plan_all_steps( + "Critical inputs changed; scheduling this step") + else: + planned, decisions = self._plan_incremental() + logging.info("Built SDMX pipeline with %d steps", len(planned)) + return BuildResult(pipeline=Pipeline(steps=planned), + decisions=decisions) + + def _plan_run_only(self, + run_only: str) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + for step in self._steps: + if step.name == run_only: + planned.append(step) + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.RUN, + reason=(f"run_only={run_only} requested; running only " + "this step"), + )) + else: + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.SKIP, + reason=(f"run_only={run_only} requested; skipping " + "this step"), + )) + if not planned: + raise ValueError(f"run_only step not found: {run_only}") + return planned, decisions + + def _plan_all_steps(self, + reason: str) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + for step in self._steps: + planned.append(step) + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.RUN, + reason=reason)) + return planned, decisions + + def _plan_incremental(self) -> tuple[list[Step], list[StepDecision]]: + planned: list[Step] = [] + decisions: list[StepDecision] = [] + schedule_all_remaining = False + previous: Step | None = None + for step in self._steps: + if schedule_all_remaining: + planned.append(step) + decisions.append( + StepDecision( + step_name=step.name, + decision=StepDecision.RUN, + reason=("Upstream step triggered rerun for remaining " + "steps"), + )) + previous = step + continue + + prev_state = self._state.steps.get(step.name) + if prev_state is None: + needs_run = True + reason = 
"No previous state recorded; scheduling step" + elif prev_state.status != "succeeded": + needs_run = True + reason = (f"Previous run status was {prev_state.status}; " + "rerunning step") + elif prev_state.version < step.version: + needs_run = True + reason = ( + f"Step version increased from {prev_state.version} to " + f"{step.version}; rerunning step") + else: + needs_run = False + reason = ("Previous run succeeded with same version; step is " + "up-to-date") + + if not needs_run and previous is not None: + if self._predecessor_newer(previous, step): + needs_run = True + reason = (f"Previous step {previous.name} finished more " + "recently; rerunning downstream steps") + + if needs_run: + planned.append(step) + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.RUN, + reason=reason)) + schedule_all_remaining = True + else: + decisions.append( + StepDecision(step_name=step.name, + decision=StepDecision.SKIP, + reason=reason)) + previous = step + + if not planned: + logging.info("No steps scheduled.") + return planned, decisions + + def _hash_changed(self) -> bool: + if not self._critical_input_hash: + return False + previous = self._state.critical_input_hash + if not previous: + return True + return previous != self._critical_input_hash + + def _predecessor_newer(self, prev_step: Step, step: Step) -> bool: + prev_state = self._state.steps.get(prev_step.name) + curr_state = self._state.steps.get(step.name) + if prev_state is None or prev_state.ended_at_ts is None: + return False + if curr_state is None: + return True + if curr_state.status != "succeeded": + return True + if curr_state.ended_at_ts is None: + return True + return prev_state.ended_at_ts > curr_state.ended_at_ts + + +def build_steps(config: PipelineConfig) -> list[Step]: + """Constructs the hard-coded list of canonical steps.""" + return [ + DownloadDataStep(name="download-data", config=config), + DownloadMetadataStep(name="download-metadata", config=config), + CreateSampleStep(name="create-sample", config=config), + CreateSchemaMapStep(name="create-schema-mapping", config=config), + ProcessFullDataStep(name="process-full-data", config=config), + CreateDcConfigStep(name="create-dc-config", config=config), + ] + + +def _log_step_decisions(decisions: Sequence[StepDecision]) -> None: + for decision in decisions: + logging.info("step=%s decision=%s reason=%s", decision.step_name, + decision.decision, decision.reason) + + +def build_sdmx_pipeline(*, + config: PipelineConfig, + state: PipelineState, + steps: Sequence[Step] | None = None, + critical_input_hash: str | None = None) -> Pipeline: + builder_steps = steps if steps is not None else build_steps(config) + builder = PipelineBuilder(config=config, + state=state, + steps=builder_steps, + critical_input_hash=critical_input_hash) + result = builder.build() + _log_step_decisions(result.decisions) + return result.pipeline diff --git a/tools/agentic_import/sdmx_pipeline_config.py b/tools/agentic_import/sdmx_pipeline_config.py new file mode 100644 index 0000000000..dd9683518b --- /dev/null +++ b/tools/agentic_import/sdmx_pipeline_config.py @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Configuration dataclasses for the SDMX agentic import pipeline.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(frozen=True) +class SdmxDataflowConfig: + """Configuration for SDMX dataflow.""" + id: str | None = None + key: str | None = None + param: str | None = None + + +@dataclass(frozen=True) +class SdmxConfig: + """Configuration for SDMX data access.""" + endpoint: str | None = None + agency: str | None = None + dataflow: SdmxDataflowConfig = field(default_factory=SdmxDataflowConfig) + + +@dataclass(frozen=True) +class SampleConfig: + """Configuration for data sampling.""" + rows: int = 1000 + + +@dataclass(frozen=True) +class RunConfig: + """Configuration for pipeline execution.""" + command: str + dataset_prefix: str | None = None + working_dir: str | None = None + run_only: str | None = None + force: bool = False + verbose: bool = False + skip_confirmation: bool = False + gemini_cli: str | None = None + + +@dataclass(frozen=True) +class PipelineConfig: + """Aggregated configuration for the pipeline.""" + sdmx: SdmxConfig = field(default_factory=SdmxConfig) + sample: SampleConfig = field(default_factory=SampleConfig) + run: RunConfig = field(default_factory=lambda: RunConfig(command="python")) diff --git a/tools/agentic_import/sdmx_pipeline_steps.py b/tools/agentic_import/sdmx_pipeline_steps.py new file mode 100644 index 0000000000..7d25165b66 --- /dev/null +++ b/tools/agentic_import/sdmx_pipeline_steps.py @@ -0,0 +1,467 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Step implementations for the SDMX agentic import pipeline.""" + +from __future__ import annotations + +import abc +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import ClassVar, Sequence + +from absl import logging + +from tools.agentic_import.pipeline import Step +from tools.agentic_import.sdmx_pipeline_config import PipelineConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] + +SDMX_CLI_PATH = REPO_ROOT / "tools" / "sdmx_import" / "sdmx_cli.py" +DATA_SAMPLER_PATH = REPO_ROOT / "tools" / "statvar_importer" / "data_sampler.py" +STAT_VAR_PROCESSOR_PATH = (REPO_ROOT / "tools" / "statvar_importer" / + "stat_var_processor.py") +PVMAP_GENERATOR_PATH = REPO_ROOT / "tools" / "agentic_import" / "pvmap_generator.py" +DC_CONFIG_GENERATOR_PATH = (REPO_ROOT / "tools" / "agentic_import" / + "generate_custom_dc_config.py") + +SAMPLE_OUTPUT_DIR = Path("sample_output") +FINAL_OUTPUT_DIR = Path("output") + +# Flag names (copied for reference/usage in steps if needed, +# though they are mostly used in main for flag definition) +_FLAG_SDMX_ENDPOINT = "sdmx.endpoint" +_FLAG_SDMX_AGENCY = "sdmx.agency" +_FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" + + +def _require_config_field(value: str | None, field_name: str, + step_name: str) -> str: + if value: + return value + raise ValueError(f"{step_name} requires config.{field_name}") + + +def _run_command(command: Sequence[str], *, verbose: bool) -> None: + if verbose: + logging.debug(f"Running command: {' '.join(command)}") + subprocess.run(command, check=True) + + +def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: + command = [sys.executable, str(SDMX_CLI_PATH), *args] + _run_command(command, verbose=verbose) + + +class SdmxStep(Step): + """Base class for SDMX steps that carries immutable config and version.""" + + def __init__(self, *, name: str, version: int, + config: PipelineConfig) -> None: + if not name: + raise ValueError("step requires a name") + self._name = name + self._version = version + self._config = config + + @property + def name(self) -> str: + return self._name + + @property + def version(self) -> int: + return self._version + + @abc.abstractmethod + def dry_run(self) -> None: + """Log a read-only preview of the work to be done.""" + + +class DownloadDataStep(SdmxStep): + """Downloads SDMX data payloads.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + full_command: list[str] + output_path: Path + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: DownloadDataStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + output_path = working_dir / f"{dataset_prefix}_data.csv" + args = [ + "download-data", + f"--endpoint={endpoint}", + f"--agency={agency}", + f"--dataflow={dataflow}", + f"--output_path={output_path}", + ] + if self._config.sdmx.dataflow.key: + args.append(f"--key={self._config.sdmx.dataflow.key}") + if self._config.sdmx.dataflow.param: + 
args.append(f"--param={self._config.sdmx.dataflow.param}") + if self._config.run.verbose: + args.append("--verbose") + full_command = [sys.executable, str(SDMX_CLI_PATH)] + args + self._context = DownloadDataStep._StepContext(full_command=full_command, + output_path=output_path) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if self._config.run.verbose: + logging.info( + f"Starting SDMX data download: {' '.join(context.full_command)} -> {context.output_path}" + ) + else: + logging.info(f"Downloading SDMX data to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class DownloadMetadataStep(SdmxStep): + """Downloads SDMX metadata payloads.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + full_command: list[str] + output_path: Path + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: DownloadMetadataStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + output_path = working_dir / f"{dataset_prefix}_metadata.xml" + args = [ + "download-metadata", + f"--endpoint={endpoint}", + f"--agency={agency}", + f"--dataflow={dataflow}", + f"--output_path={output_path}", + ] + if self._config.run.verbose: + args.append("--verbose") + full_command = [sys.executable, str(SDMX_CLI_PATH)] + args + self._context = DownloadMetadataStep._StepContext( + full_command=full_command, output_path=output_path) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if self._config.run.verbose: + logging.info( + f"Starting SDMX metadata download: {' '.join(context.full_command)} -> {context.output_path}" + ) + else: + logging.info(f"Downloading SDMX metadata to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class CreateSampleStep(SdmxStep): + """Creates a sample dataset from downloaded data.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + input_path: Path + full_command: list[str] + output_path: Path + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateSampleStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + input_path = working_dir / f"{dataset_prefix}_data.csv" + output_path = working_dir / f"{dataset_prefix}_sample.csv" + + args = [ + f"--sampler_input={input_path}", + f"--sampler_output={output_path}", + 
f"--sampler_output_rows={self._config.sample.rows}", + ] + full_command = [sys.executable, str(DATA_SAMPLER_PATH)] + args + self._context = CreateSampleStep._StepContext(input_path=input_path, + full_command=full_command, + output_path=output_path) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if not context.input_path.is_file(): + raise RuntimeError( + f"Input file missing for sampling: {context.input_path}") + if self._config.run.verbose: + logging.info( + f"Starting data sampling: {' '.join(context.full_command)} -> {context.output_path}" + ) + else: + logging.info(f"Sampling data to {context.output_path}") + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class CreateSchemaMapStep(SdmxStep): + """Builds schema mappings for transformed data.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + sample_path: Path + metadata_path: Path + output_prefix: Path + full_command: list[str] + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateSchemaMapStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + sample_path = working_dir / f"{dataset_prefix}_sample.csv" + metadata_path = working_dir / f"{dataset_prefix}_metadata.xml" + output_prefix = working_dir / SAMPLE_OUTPUT_DIR / dataset_prefix + + args = [ + f"--input_data={sample_path}", + f"--input_metadata={metadata_path}", + "--sdmx_dataset", + f"--output_path={output_prefix}", + ] + if self._config.run.skip_confirmation: + args.append("--skip_confirmation") + if self._config.run.gemini_cli: + args.append(f"--gemini_cli={self._config.run.gemini_cli}") + args.append(f"--working_dir={working_dir}") + + full_command = [sys.executable, str(PVMAP_GENERATOR_PATH)] + args + self._context = CreateSchemaMapStep._StepContext( + sample_path=sample_path, + metadata_path=metadata_path, + output_prefix=output_prefix, + full_command=full_command) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if not context.sample_path.is_file(): + raise RuntimeError(f"Sample file missing: {context.sample_path}") + if not context.metadata_path.is_file(): + raise RuntimeError( + f"Metadata file missing: {context.metadata_path}") + context.output_prefix.parent.mkdir(parents=True, exist_ok=True) + logging.info( + f"Starting PV map generation: {' '.join(context.full_command)} -> {context.output_prefix}" + ) + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class ProcessFullDataStep(SdmxStep): + """Processes full SDMX data into DC artifacts.""" + + VERSION = 1 + + RUN_OUTPUT_COLUMNS: ClassVar[str] = ( + "observationDate,observationAbout,variableMeasured,value," + "observationPeriod,measurementMethod,unit,scalingFactor") + + @dataclass(frozen=True) + class _StepContext: + input_data_path: Path + pv_map_path: Path + metadata_path: Path + full_command: list[str] + output_prefix: Path + + def __init__(self, *, name: str, config: 
PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: ProcessFullDataStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + input_data_path = working_dir / f"{dataset_prefix}_data.csv" + pv_map_path = (working_dir / SAMPLE_OUTPUT_DIR / + f"{dataset_prefix}_pvmap.csv") + metadata_path = (working_dir / SAMPLE_OUTPUT_DIR / + f"{dataset_prefix}_metadata.csv") + output_prefix = working_dir / FINAL_OUTPUT_DIR / dataset_prefix + + args = [ + f"--input_data={input_data_path}", + f"--pv_map={pv_map_path}", + f"--config_file={metadata_path}", + "--generate_statvar_name=True", + "--skip_constant_csv_columns=False", + f"--output_columns={self.RUN_OUTPUT_COLUMNS}", + f"--output_path={output_prefix}", + ] + full_command = [sys.executable, str(STAT_VAR_PROCESSOR_PATH)] + args + self._context = ProcessFullDataStep._StepContext( + input_data_path=input_data_path, + pv_map_path=pv_map_path, + metadata_path=metadata_path, + full_command=full_command, + output_prefix=output_prefix, + ) + return self._context + + def run(self) -> None: + context = self._prepare_command() + for required in (context.input_data_path, context.pv_map_path, + context.metadata_path): + if not required.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {required}") + # Ensure output directory exists + context.output_prefix.parent.mkdir(parents=True, exist_ok=True) + logging.info( + f"Starting stat_var_processor: input={context.input_data_path} " + f"pvmap={context.pv_map_path} metadata={context.metadata_path} -> " + f"{context.output_prefix}") + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) + + +class CreateDcConfigStep(SdmxStep): + """Generates Datacommons configuration artifacts.""" + + VERSION = 1 + + @dataclass(frozen=True) + class _StepContext: + input_csv: Path + output_config: Path + full_command: list[str] + + def __init__(self, *, name: str, config: PipelineConfig) -> None: + super().__init__(name=name, version=self.VERSION, config=config) + self._context: CreateDcConfigStep._StepContext | None = None + + def _prepare_command(self) -> _StepContext: + if self._context: + return self._context + dataset_prefix = self._config.run.dataset_prefix + working_dir = Path(self._config.run.working_dir).resolve() + input_csv = working_dir / FINAL_OUTPUT_DIR / f"{dataset_prefix}.csv" + output_config = (working_dir / FINAL_OUTPUT_DIR / + f"{dataset_prefix}_config.json") + + endpoint = _require_config_field(self._config.sdmx.endpoint, + _FLAG_SDMX_ENDPOINT, self.name) + agency = _require_config_field(self._config.sdmx.agency, + _FLAG_SDMX_AGENCY, self.name) + dataflow = _require_config_field(self._config.sdmx.dataflow.id, + _FLAG_SDMX_DATAFLOW_ID, self.name) + + dataset_url = (f"{endpoint.rstrip('/')}/data/" + f"{agency},{dataflow},") + + args = [ + f"--input_csv={input_csv}", + f"--output_config={output_config}", + f"--provenance_name={dataflow}", + f"--source_name={agency}", + f"--data_source_url={endpoint}", + f"--dataset_url={dataset_url}", + ] + full_command = [sys.executable, str(DC_CONFIG_GENERATOR_PATH)] + args + self._context = CreateDcConfigStep._StepContext( + input_csv=input_csv, + 
output_config=output_config, + full_command=full_command) + return self._context + + def run(self) -> None: + context = self._prepare_command() + if not context.input_csv.is_file(): + raise RuntimeError( + f"{self.name} requires existing input: {context.input_csv}") + + logging.info( + f"Starting custom DC config generation: input={context.input_csv} -> {context.output_config}" + ) + _run_command(context.full_command, verbose=self._config.run.verbose) + + def dry_run(self) -> None: + context = self._prepare_command() + logging.info( + f"{self.name} (dry run): would run {' '.join(context.full_command)}" + ) From b10c1be46117145c105d2f8b524288cfc1cd9646 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Thu, 27 Nov 2025 09:18:20 +0000 Subject: [PATCH 51/54] refactor: centralize SDMX flag constants --- tools/agentic_import/sdmx_import_pipeline.py | 57 ++++++++++---------- tools/agentic_import/sdmx_pipeline_config.py | 7 +++ tools/agentic_import/sdmx_pipeline_steps.py | 36 ++++++------- 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/tools/agentic_import/sdmx_import_pipeline.py b/tools/agentic_import/sdmx_import_pipeline.py index a26e6799f0..32957e6c6d 100644 --- a/tools/agentic_import/sdmx_import_pipeline.py +++ b/tools/agentic_import/sdmx_import_pipeline.py @@ -37,41 +37,44 @@ PipelineCallback, PipelineRunner, RunnerConfig, Step) from tools.agentic_import.sdmx_pipeline_builder import build_sdmx_pipeline -from tools.agentic_import.sdmx_pipeline_config import (PipelineConfig, - RunConfig, SampleConfig, - SdmxConfig, - SdmxDataflowConfig) +from tools.agentic_import.sdmx_pipeline_config import ( + FLAG_SDMX_AGENCY, + FLAG_SDMX_DATAFLOW_ID, + FLAG_SDMX_DATAFLOW_KEY, + FLAG_SDMX_DATAFLOW_PARAM, + FLAG_SDMX_ENDPOINT, + PipelineConfig, + RunConfig, + SampleConfig, + SdmxConfig, + SdmxDataflowConfig, +) from tools.agentic_import.sdmx_pipeline_steps import SdmxStep from tools.agentic_import.state_handler import StateHandler, StepState # Flag names -_FLAG_SDMX_ENDPOINT = "sdmx.endpoint" -_FLAG_SDMX_AGENCY = "sdmx.agency" -_FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" -_FLAG_SDMX_DATAFLOW_KEY = "sdmx.dataflow.key" -_FLAG_SDMX_DATAFLOW_PARAM = "sdmx.dataflow.param" _FLAG_SAMPLE_ROWS = "sample.rows" FLAGS = flags.FLAGS def _define_flags() -> None: - flags.DEFINE_string(_FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") - flags.mark_flag_as_required(_FLAG_SDMX_ENDPOINT) + flags.DEFINE_string(FLAG_SDMX_ENDPOINT, None, "SDMX service endpoint.") + flags.mark_flag_as_required(FLAG_SDMX_ENDPOINT) - flags.DEFINE_string(_FLAG_SDMX_AGENCY, None, + flags.DEFINE_string(FLAG_SDMX_AGENCY, None, "Owning SDMX agency identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_AGENCY) + flags.mark_flag_as_required(FLAG_SDMX_AGENCY) - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_ID, None, + flags.DEFINE_string(FLAG_SDMX_DATAFLOW_ID, None, "Target SDMX dataflow identifier.") - flags.mark_flag_as_required(_FLAG_SDMX_DATAFLOW_ID) + flags.mark_flag_as_required(FLAG_SDMX_DATAFLOW_ID) - flags.DEFINE_string(_FLAG_SDMX_DATAFLOW_KEY, None, + flags.DEFINE_string(FLAG_SDMX_DATAFLOW_KEY, None, "Optional SDMX key or filter.") flags.DEFINE_string( - _FLAG_SDMX_DATAFLOW_PARAM, None, + FLAG_SDMX_DATAFLOW_PARAM, None, "Optional SDMX parameter appended to the dataflow query.") flags.DEFINE_integer(_FLAG_SAMPLE_ROWS, 1000, @@ -126,11 +129,11 @@ def _resolve_dataset_prefix(config: PipelineConfig) -> str: def _compute_critical_input_hash(config: PipelineConfig) -> str: payload = { - _FLAG_SDMX_AGENCY: config.sdmx.agency, - 
_FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, - _FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, - _FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, - _FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, + FLAG_SDMX_AGENCY: config.sdmx.agency, + FLAG_SDMX_DATAFLOW_ID: config.sdmx.dataflow.id, + FLAG_SDMX_ENDPOINT: config.sdmx.endpoint, + FLAG_SDMX_DATAFLOW_KEY: config.sdmx.dataflow.key, + FLAG_SDMX_DATAFLOW_PARAM: config.sdmx.dataflow.param, } serialized = json.dumps(payload, sort_keys=True, separators=(",", ":")) return hashlib.sha256(serialized.encode("utf-8")).hexdigest() @@ -296,12 +299,12 @@ def prepare_config() -> PipelineConfig: command = shlex.join(sys.argv) if sys.argv else "python" return PipelineConfig( sdmx=SdmxConfig( - endpoint=FLAGS[_FLAG_SDMX_ENDPOINT].value, - agency=FLAGS[_FLAG_SDMX_AGENCY].value, + endpoint=FLAGS[FLAG_SDMX_ENDPOINT].value, + agency=FLAGS[FLAG_SDMX_AGENCY].value, dataflow=SdmxDataflowConfig( - id=FLAGS[_FLAG_SDMX_DATAFLOW_ID].value, - key=FLAGS[_FLAG_SDMX_DATAFLOW_KEY].value, - param=FLAGS[_FLAG_SDMX_DATAFLOW_PARAM].value, + id=FLAGS[FLAG_SDMX_DATAFLOW_ID].value, + key=FLAGS[FLAG_SDMX_DATAFLOW_KEY].value, + param=FLAGS[FLAG_SDMX_DATAFLOW_PARAM].value, ), ), sample=SampleConfig(rows=FLAGS[_FLAG_SAMPLE_ROWS].value,), diff --git a/tools/agentic_import/sdmx_pipeline_config.py b/tools/agentic_import/sdmx_pipeline_config.py index dd9683518b..d6260eabd7 100644 --- a/tools/agentic_import/sdmx_pipeline_config.py +++ b/tools/agentic_import/sdmx_pipeline_config.py @@ -17,6 +17,13 @@ from dataclasses import dataclass, field +# SDMX flag names shared across pipeline modules. +FLAG_SDMX_ENDPOINT = "sdmx.endpoint" +FLAG_SDMX_AGENCY = "sdmx.agency" +FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" +FLAG_SDMX_DATAFLOW_KEY = "sdmx.dataflow.key" +FLAG_SDMX_DATAFLOW_PARAM = "sdmx.dataflow.param" + @dataclass(frozen=True) class SdmxDataflowConfig: diff --git a/tools/agentic_import/sdmx_pipeline_steps.py b/tools/agentic_import/sdmx_pipeline_steps.py index 7d25165b66..53597972b8 100644 --- a/tools/agentic_import/sdmx_pipeline_steps.py +++ b/tools/agentic_import/sdmx_pipeline_steps.py @@ -25,7 +25,12 @@ from absl import logging from tools.agentic_import.pipeline import Step -from tools.agentic_import.sdmx_pipeline_config import PipelineConfig +from tools.agentic_import.sdmx_pipeline_config import ( + FLAG_SDMX_AGENCY, + FLAG_SDMX_DATAFLOW_ID, + FLAG_SDMX_ENDPOINT, + PipelineConfig, +) REPO_ROOT = Path(__file__).resolve().parents[2] @@ -40,12 +45,6 @@ SAMPLE_OUTPUT_DIR = Path("sample_output") FINAL_OUTPUT_DIR = Path("output") -# Flag names (copied for reference/usage in steps if needed, -# though they are mostly used in main for flag definition) -_FLAG_SDMX_ENDPOINT = "sdmx.endpoint" -_FLAG_SDMX_AGENCY = "sdmx.agency" -_FLAG_SDMX_DATAFLOW_ID = "sdmx.dataflow.id" - def _require_config_field(value: str | None, field_name: str, step_name: str) -> str: @@ -60,11 +59,6 @@ def _run_command(command: Sequence[str], *, verbose: bool) -> None: subprocess.run(command, check=True) -def _run_sdmx_cli(args: Sequence[str], *, verbose: bool) -> None: - command = [sys.executable, str(SDMX_CLI_PATH), *args] - _run_command(command, verbose=verbose) - - class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" @@ -107,11 +101,11 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) + FLAG_SDMX_ENDPOINT, self.name) agency = 
_require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) + FLAG_SDMX_AGENCY, self.name) dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) + FLAG_SDMX_DATAFLOW_ID, self.name) dataset_prefix = self._config.run.dataset_prefix working_dir = Path(self._config.run.working_dir).resolve() output_path = working_dir / f"{dataset_prefix}_data.csv" @@ -168,11 +162,11 @@ def _prepare_command(self) -> _StepContext: if self._context: return self._context endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) + FLAG_SDMX_ENDPOINT, self.name) agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) + FLAG_SDMX_AGENCY, self.name) dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) + FLAG_SDMX_DATAFLOW_ID, self.name) dataset_prefix = self._config.run.dataset_prefix working_dir = Path(self._config.run.working_dir).resolve() output_path = working_dir / f"{dataset_prefix}_metadata.xml" @@ -425,11 +419,11 @@ def _prepare_command(self) -> _StepContext: f"{dataset_prefix}_config.json") endpoint = _require_config_field(self._config.sdmx.endpoint, - _FLAG_SDMX_ENDPOINT, self.name) + FLAG_SDMX_ENDPOINT, self.name) agency = _require_config_field(self._config.sdmx.agency, - _FLAG_SDMX_AGENCY, self.name) + FLAG_SDMX_AGENCY, self.name) dataflow = _require_config_field(self._config.sdmx.dataflow.id, - _FLAG_SDMX_DATAFLOW_ID, self.name) + FLAG_SDMX_DATAFLOW_ID, self.name) dataset_url = (f"{endpoint.rstrip('/')}/data/" f"{agency},{dataflow},") From e9105a9829d9702f8846066301e71b1f46ceac13 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 8 Dec 2025 05:19:00 +0000 Subject: [PATCH 52/54] fix: support string version identifiers --- tools/agentic_import/pipeline.py | 8 +-- tools/agentic_import/pipeline_test.py | 6 +-- .../sdmx_import_pipeline_test.py | 50 +++++++++---------- tools/agentic_import/sdmx_pipeline_builder.py | 7 ++- tools/agentic_import/sdmx_pipeline_steps.py | 16 +++--- tools/agentic_import/state_handler.py | 2 +- 6 files changed, 44 insertions(+), 45 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index 43d009c3b6..d33abb8534 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -37,8 +37,8 @@ def name(self) -> str: @property @abc.abstractmethod - def version(self) -> int: - """Version used for invalidation decisions.""" + def version(self) -> str: + """Version string used for invalidation decisions.""" @abc.abstractmethod def run(self) -> None: @@ -48,7 +48,7 @@ def run(self) -> None: class BaseStep(Step, abc.ABC): """Helper base class that stores mandatory metadata.""" - def __init__(self, *, name: str, version: int) -> None: + def __init__(self, *, name: str, version: str) -> None: if not name: raise ValueError("step requires a name") self._name = name @@ -59,7 +59,7 @@ def name(self) -> str: return self._name @property - def version(self) -> int: + def version(self) -> str: return self._version diff --git a/tools/agentic_import/pipeline_test.py b/tools/agentic_import/pipeline_test.py index 52944546f1..2abfc4c45c 100644 --- a/tools/agentic_import/pipeline_test.py +++ b/tools/agentic_import/pipeline_test.py @@ -31,7 +31,7 @@ class _TrackingStep(BaseStep): def __init__(self, name: str, events: list[str]) -> None: - super().__init__(name=name, version=1) + super().__init__(name=name, version="1") self._events = events self.executed = False 
@@ -42,7 +42,7 @@ def run(self) -> None: class _FailingStep(BaseStep): - def __init__(self, *, name: str, version: int) -> None: + def __init__(self, *, name: str, version: str) -> None: super().__init__(name=name, version=version) def run(self) -> None: @@ -132,7 +132,7 @@ def after_step(self, self.after_calls.append((name, error_name)) callback = RecordingCallback() - pipeline = Pipeline(steps=[_FailingStep(name="fail-step", version=1)]) + pipeline = Pipeline(steps=[_FailingStep(name="fail-step", version="1")]) with self.assertRaises(ValueError): PipelineRunner(RunnerConfig()).run(pipeline, callback) diff --git a/tools/agentic_import/sdmx_import_pipeline_test.py b/tools/agentic_import/sdmx_import_pipeline_test.py index 88b86b5721..367ee0364f 100644 --- a/tools/agentic_import/sdmx_import_pipeline_test.py +++ b/tools/agentic_import/sdmx_import_pipeline_test.py @@ -77,7 +77,7 @@ def __call__(self) -> datetime: class _RecordingStep(SdmxStep): def __init__(self, name: str, *, should_fail: bool = False) -> None: - super().__init__(name=name, version=1, config=_TEST_CONFIG) + super().__init__(name=name, version="1", config=_TEST_CONFIG) self._should_fail = should_fail def run(self) -> None: @@ -90,7 +90,7 @@ def dry_run(self) -> None: class _VersionedStep(SdmxStep): - def __init__(self, name: str, version: int) -> None: + def __init__(self, name: str, version: str) -> None: super().__init__(name=name, version=version, config=_TEST_CONFIG) def run(self) -> None: @@ -188,7 +188,7 @@ def test_abort_skips_state_persistence(self) -> None: "updated_at_ts": 1, "steps": { "existing.step": { - "version": 1, + "version": "1", "status": "succeeded", "started_at": "2025-01-01T00:00:00Z", "started_at_ts": 0, @@ -207,7 +207,7 @@ class _AbortStep(SdmxStep): def __init__(self) -> None: super().__init__(name="download.download-data", - version=1, + version="1", config=_TEST_CONFIG) def run(self) -> None: @@ -314,7 +314,7 @@ def _empty_state(self) -> PipelineState: steps={}) def _state_with( - self, versions: dict[str, tuple[int, str, + self, versions: dict[str, tuple[str, str, int | None]]) -> PipelineState: steps = { name: @@ -379,12 +379,12 @@ def test_timestamp_chaining_triggers_next_step(self) -> None: newer = 2_000 older = 1_000 state = self._state_with({ - "download-data": (1, "succeeded", newer), - "download-metadata": (1, "succeeded", older), - "create-sample": (1, "succeeded", older), - "create-schema-mapping": (1, "succeeded", older), - "process-full-data": (1, "succeeded", older), - "create-dc-config": (1, "succeeded", older), + "download-data": ("1", "succeeded", newer), + "download-metadata": ("1", "succeeded", older), + "create-sample": ("1", "succeeded", older), + "create-schema-mapping": ("1", "succeeded", older), + "process-full-data": ("1", "succeeded", older), + "create-dc-config": ("1", "succeeded", older), }) cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) names = self._names_from_builder(cfg, state=state) @@ -412,8 +412,8 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: newer = 4_000 older = 3_000 state = self._state_with({ - "download-data": (1, "succeeded", newer), - "download-metadata": (1, "succeeded", older), + "download-data": ("1", "succeeded", newer), + "download-metadata": ("1", "succeeded", older), }) cfg = PipelineConfig( run=RunConfig(command=_TEST_COMMAND, run_only="download-data")) @@ -422,14 +422,14 @@ def test_run_only_ignores_timestamp_chaining(self) -> None: def test_version_bump_schedules_downstream(self) -> None: steps = [ - 
_VersionedStep("download-data", 1), - _VersionedStep("process-full-data", 2), - _VersionedStep("create-dc-config", 1), + _VersionedStep("download-data", "1"), + _VersionedStep("process-full-data", "2"), + _VersionedStep("create-dc-config", "1"), ] state = self._state_with({ - "download-data": (1, "succeeded", 1000), - "process-full-data": (1, "succeeded", 1000), - "create-dc-config": (1, "succeeded", 1000), + "download-data": ("1", "succeeded", 1000), + "process-full-data": ("1", "succeeded", 1000), + "create-dc-config": ("1", "succeeded", 1000), }) cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) names = self._names_from_builder(cfg, steps, state) @@ -441,12 +441,12 @@ def test_version_bump_schedules_downstream(self) -> None: def test_incremental_records_skip_reasons(self) -> None: state = self._state_with({ - "download-data": (1, "succeeded", 1_000), - "download-metadata": (1, "succeeded", 1_000), - "create-sample": (1, "succeeded", 1_000), - "create-schema-mapping": (1, "succeeded", 1_000), - "process-full-data": (1, "succeeded", 1_000), - "create-dc-config": (1, "succeeded", 1_000), + "download-data": ("1", "succeeded", 1_000), + "download-metadata": ("1", "succeeded", 1_000), + "create-sample": ("1", "succeeded", 1_000), + "create-schema-mapping": ("1", "succeeded", 1_000), + "process-full-data": ("1", "succeeded", 1_000), + "create-dc-config": ("1", "succeeded", 1_000), }) cfg = PipelineConfig(run=RunConfig(command=_TEST_COMMAND)) steps = build_steps(cfg) diff --git a/tools/agentic_import/sdmx_pipeline_builder.py b/tools/agentic_import/sdmx_pipeline_builder.py index 8d8c01ee64..e7588577e5 100644 --- a/tools/agentic_import/sdmx_pipeline_builder.py +++ b/tools/agentic_import/sdmx_pipeline_builder.py @@ -142,11 +142,10 @@ def _plan_incremental(self) -> tuple[list[Step], list[StepDecision]]: needs_run = True reason = (f"Previous run status was {prev_state.status}; " "rerunning step") - elif prev_state.version < step.version: + elif prev_state.version != step.version: needs_run = True - reason = ( - f"Step version increased from {prev_state.version} to " - f"{step.version}; rerunning step") + reason = (f"Step version changed from {prev_state.version} to " + f"{step.version}; rerunning step") else: needs_run = False reason = ("Previous run succeeded with same version; step is " diff --git a/tools/agentic_import/sdmx_pipeline_steps.py b/tools/agentic_import/sdmx_pipeline_steps.py index 53597972b8..9455a811c3 100644 --- a/tools/agentic_import/sdmx_pipeline_steps.py +++ b/tools/agentic_import/sdmx_pipeline_steps.py @@ -62,7 +62,7 @@ def _run_command(command: Sequence[str], *, verbose: bool) -> None: class SdmxStep(Step): """Base class for SDMX steps that carries immutable config and version.""" - def __init__(self, *, name: str, version: int, + def __init__(self, *, name: str, version: str, config: PipelineConfig) -> None: if not name: raise ValueError("step requires a name") @@ -75,7 +75,7 @@ def name(self) -> str: return self._name @property - def version(self) -> int: + def version(self) -> str: return self._version @abc.abstractmethod @@ -86,7 +86,7 @@ def dry_run(self) -> None: class DownloadDataStep(SdmxStep): """Downloads SDMX data payloads.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: @@ -147,7 +147,7 @@ def dry_run(self) -> None: class DownloadMetadataStep(SdmxStep): """Downloads SDMX metadata payloads.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: @@ -204,7 +204,7 @@ def dry_run(self) -> None: class 
CreateSampleStep(SdmxStep): """Creates a sample dataset from downloaded data.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: @@ -258,7 +258,7 @@ def dry_run(self) -> None: class CreateSchemaMapStep(SdmxStep): """Builds schema mappings for transformed data.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: @@ -323,7 +323,7 @@ def dry_run(self) -> None: class ProcessFullDataStep(SdmxStep): """Processes full SDMX data into DC artifacts.""" - VERSION = 1 + VERSION = "1" RUN_OUTPUT_COLUMNS: ClassVar[str] = ( "observationDate,observationAbout,variableMeasured,value," @@ -397,7 +397,7 @@ def dry_run(self) -> None: class CreateDcConfigStep(SdmxStep): """Generates Datacommons configuration artifacts.""" - VERSION = 1 + VERSION = "1" @dataclass(frozen=True) class _StepContext: diff --git a/tools/agentic_import/state_handler.py b/tools/agentic_import/state_handler.py index 31dabccc1f..ea1d593197 100644 --- a/tools/agentic_import/state_handler.py +++ b/tools/agentic_import/state_handler.py @@ -32,7 +32,7 @@ @dataclass_json @dataclass class StepState: - version: int + version: str status: str started_at: str ended_at: str From a2b81bdecb1211fc1d53cfcc91837daad9a5ebcb Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 8 Dec 2025 05:43:01 +0000 Subject: [PATCH 53/54] removed del --- tools/agentic_import/pipeline.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index d33abb8534..bbccc60862 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -73,11 +73,9 @@ class PipelineCallback: def before_step(self, step: Step) -> None: """Called immediately before `step.run()`; raising an error skips execution.""" - del step def after_step(self, step: Step, *, error: Exception | None = None) -> None: """Runs once per step after `step.run()` succeeds or raises.""" - del step, error class CompositeCallback(PipelineCallback): From 17f43bfaaeca4c8e0507e4bf703872db6e0461ab Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 17 Dec 2025 09:14:55 +0000 Subject: [PATCH 54/54] add comment --- tools/agentic_import/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/agentic_import/pipeline.py b/tools/agentic_import/pipeline.py index bbccc60862..57b234da98 100644 --- a/tools/agentic_import/pipeline.py +++ b/tools/agentic_import/pipeline.py @@ -42,7 +42,7 @@ def version(self) -> str: @abc.abstractmethod def run(self) -> None: - """Execute the step.""" + """Execute the step. Raise an exception to signal failure.""" class BaseStep(Step, abc.ABC):
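
For reference, a minimal sketch of how the runner API looks after these patches: string step versions (patch 52), optional no-op callback hooks (patch 53), and steps that signal failure by raising (patch 54). The EchoStep and LoggingCallback classes below are invented for illustration and do not exist in the repository; the sketch also assumes the repo root is on PYTHONPATH and the Phase 0 Step interface (dry_run returning a preview string).

    # Illustrative only; not part of the patch series.
    from tools.agentic_import.pipeline import (BaseStep, Pipeline,
                                                PipelineCallback,
                                                PipelineRunner, RunnerConfig,
                                                Step)


    class EchoStep(BaseStep):
        """Hypothetical step that only reports that it ran."""

        def __init__(self, name: str) -> None:
            # Versions are plain strings after patch 52.
            super().__init__(name=name, version="1")

        def run(self) -> None:
            # Raise any exception here to signal failure (patch 54).
            print(f"running {self.name}")

        def dry_run(self) -> str:
            return f"would run {self.name}"


    class LoggingCallback(PipelineCallback):
        """Hypothetical callback; overriding hooks is optional since the
        base-class hooks are no-ops after patch 53."""

        def before_step(self, step: Step) -> None:
            print(f"before {step.name} (v{step.version})")

        def after_step(self,
                       step: Step,
                       *,
                       error: Exception | None = None) -> None:
            print(f"after {step.name}: {'ok' if error is None else error}")


    if __name__ == "__main__":
        pipeline = Pipeline(steps=[EchoStep("echo-one"), EchoStep("echo-two")])
        PipelineRunner(RunnerConfig()).run(pipeline, LoggingCallback())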