From b66a49dc9301ce1c59e4bcf9b0425761d3acdbe7 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 29 Apr 2026 08:45:12 +0200
Subject: [PATCH 01/18] Add data downloading functionality and gdown dependency
 for test setup

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 25 +++++++++++++++++++++----
 pyproject.toml                    |  1 +
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index 90da468a..9ca52bb1 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -6,6 +6,8 @@
 import yaml
 import urllib.request
 import shutil
+import zipfile
+import gdown
 
 
 def execute_and_live_output(cmd) -> None:
@@ -106,13 +108,28 @@ def test_train(cleanup_test_artifacts):
     data_dir = "data_out"
 
     if not os.path.exists(data_dir) or not os.listdir(data_dir):
-        print("Data directory not found or empty, generating data...")
+        print("Data directory not found or empty, downloading pre-generated data...")
 
-        config_path = prepare_config()
+        # --- Dataset generation (commented out, using pre-generated data instead) ---
+        # config_path = prepare_config()
+        # execute_and_live_output(f"gridfm_datakit generate {config_path}")
+        # -------------------------------------------------------------------------
 
-        execute_and_live_output(f"gridfm_datakit generate {config_path}")
+        gdrive_file_id = "1NtE_4Fn3-1_BNWidZVFeSTfXf3-B50Yr"
+        zip_filename = "case14_ieee.10000_scenarios_2_variants.zip"
+        gdrive_url = f"https://drive.google.com/uc?id={gdrive_file_id}"
+
+        print(f"Downloading {zip_filename} from Google Drive...")
+        gdown.download(gdrive_url, zip_filename, quiet=False)
+
+        print(f"Extracting {zip_filename}...")
+        with zipfile.ZipFile(zip_filename, "r") as zf:
+            zf.extractall(".")
+
+        os.remove(zip_filename)
+        print(f"Data extracted to '{data_dir}'.")
     else:
-        print(f"Data directory '{data_dir}' already exists, skipping generation.")
+        print(f"Data directory '{data_dir}' already exists, skipping download.")
 
     training_config_path = prepare_training_config()
 
diff --git a/pyproject.toml b/pyproject.toml
index 2b6c523c..a6a1e2a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,6 +54,7 @@ dependencies = [
     "lightning",
     "seaborn",
     "urllib3>=2.6.0",
+    "gdown>=6.0.0",
     "gridfm-datakit>=1.0.2",
 ]
 

From 8b99b82d22c47a7fbe716faa0fa6cc9bffbef2ca Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 29 Apr 2026 09:10:18 +0200
Subject: [PATCH 02/18] Refactor test setup by removing unused config
 preparation function and related code

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 37 -------------------------------
 1 file changed, 37 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index 9ca52bb1..fee6ff5e 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -4,7 +4,6 @@
 import glob
 import pandas as pd
 import yaml
-import urllib.request
 import shutil
 import zipfile
 import gdown
@@ -14,37 +13,6 @@ def execute_and_live_output(cmd) -> None:
     subprocess.run(cmd, text=True, shell=True, check=True)
 
 
-def prepare_config():
-    """
-    Download default.yaml from gridfm-datakit repo and modify it with test parameters.
-    """
-    config_url = "https://raw.githubusercontent.com/gridfm/gridfm-datakit/refs/heads/main/scripts/config/default.yaml"
-    config_path = "integrationtests/default.yaml"
-
-    print(f"Downloading config from {config_url}...")
-    with urllib.request.urlopen(config_url) as response:
-        config_content = response.read().decode("utf-8")
-
-    config = yaml.safe_load(config_content)
-
-    config["network"]["name"] = "case14_ieee"
-    config["load"]["scenarios"] = 10000
-    config["topology_perturbation"]["n_topology_variants"] = 2
-
-    with open(config_path, "w") as f:
-        yaml.dump(config, f, default_flow_style=False, sort_keys=False)
-
-    print(f"Config prepared at {config_path} with:")
-    print(f"  - network.name: {config['network']['name']}")
-    print(f"  - load.scenarios: {config['load']['scenarios']}")
-    print(
-        f"  - topology_perturbation.n_topology_variants: "
-        f"{config['topology_perturbation']['n_topology_variants']}",
-    )
-
-    return config_path
-
-
 def prepare_training_config():
     """
     Modify the training config to set epochs to 2 for testing.
@@ -110,11 +78,6 @@ def test_train(cleanup_test_artifacts):
     if not os.path.exists(data_dir) or not os.listdir(data_dir):
         print("Data directory not found or empty, downloading pre-generated data...")
 
-        # --- Dataset generation (commented out, using pre-generated data instead) ---
-        # config_path = prepare_config()
-        # execute_and_live_output(f"gridfm_datakit generate {config_path}")
-        # -------------------------------------------------------------------------
-
         gdrive_file_id = "1NtE_4Fn3-1_BNWidZVFeSTfXf3-B50Yr"
         zip_filename = "case14_ieee.10000_scenarios_2_variants.zip"
         gdrive_url = f"https://drive.google.com/uc?id={gdrive_file_id}"

From 9be32c4ddda141239d99db5039b2637878de9c4d Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 29 Apr 2026 09:22:25 +0200
Subject: [PATCH 03/18] Add test data generation functions for power-flow and
 optimal power-flow scenarios

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/generate_test_data.py | 72 ++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 integrationtests/generate_test_data.py

diff --git a/integrationtests/generate_test_data.py b/integrationtests/generate_test_data.py
new file mode 100644
index 00000000..913f00d5
--- /dev/null
+++ b/integrationtests/generate_test_data.py
@@ -0,0 +1,72 @@
+import urllib.request
+import yaml
+import subprocess
+
+
+def execute_and_live_output(cmd) -> None:
+    subprocess.run(cmd, text=True, shell=True, check=True)
+
+
+def _base_config() -> dict:
+    """
+    Download the default config from gridfm-datakit and apply common test parameters.
+    """
+    config_url = (
+        "https://raw.githubusercontent.com/gridfm/gridfm-datakit/refs/heads/main"
+        "/scripts/config/default.yaml"
+    )
+
+    print(f"Downloading config from {config_url}...")
+    with urllib.request.urlopen(config_url) as response:
+        config_content = response.read().decode("utf-8")
+
+    config = yaml.safe_load(config_content)
+
+    config["network"]["name"] = "case14_ieee"
+    config["load"]["scenarios"] = 10000
+    config["topology_perturbation"]["n_topology_variants"] = 2
+
+    return config
+
+
+def generate_pf_test_data(config_path: str = "integrationtests/default_pf.yaml") -> None:
+    """
+    Generate power-flow (PF) test data for case14_ieee with 10 000 scenarios
+    and 2 topology variants.
+    """
+    config = _base_config()
+
+    with open(config_path, "w") as f:
+        yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+
+    print(f"PF config written to {config_path}")
+    print(f"  network.name                          : {config['network']['name']}")
+    print(f"  load.scenarios                        : {config['load']['scenarios']}")
+    print(f"  topology_perturbation.n_topology_variants: {config['topology_perturbation']['n_topology_variants']}")
+
+    execute_and_live_output(f"gridfm_datakit generate {config_path}")
+
+
+def generate_opf_test_data(config_path: str = "integrationtests/default_opf.yaml") -> None:
+    """
+    Generate optimal power-flow (OPF) test data for case14_ieee with 10 000 scenarios
+    and 2 topology variants.
+    """
+    config = _base_config()
+    config.setdefault("settings", {})["mode"] = "opf"
+
+    with open(config_path, "w") as f:
+        yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+
+    print(f"OPF config written to {config_path}")
+    print(f"  network.name                          : {config['network']['name']}")
+    print(f"  load.scenarios                        : {config['load']['scenarios']}")
+    print(f"  topology_perturbation.n_topology_variants: {config['topology_perturbation']['n_topology_variants']}")
+    print(f"  settings.mode                         : {config['settings']['mode']}")
+
+    execute_and_live_output(f"gridfm_datakit generate {config_path}")
+
+
+if __name__ == "__main__":
+    generate_pf_test_data()
+    generate_opf_test_data()

From a76b1cccdd92bb8412962cbb92d24856d9b87bde Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 29 Apr 2026 11:59:00 +0200
Subject: [PATCH 04/18] Add OPF integration test with separate data/log dirs
 and metric validation

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 127 +++++++++++++++++++++++++++++-
 1 file changed, 126 insertions(+), 1 deletion(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index fee6ff5e..e291cd17 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -7,6 +7,7 @@
 import shutil
 import zipfile
 import gdown
+import tempfile
 
 
 def execute_and_live_output(cmd) -> None:
@@ -15,7 +16,7 @@ def execute_and_live_output(cmd) -> None:
 
 def prepare_training_config():
     """
-    Modify the training config to set epochs to 2 for testing.
+    Modify the PF training config to set epochs to 2 for testing.
     """
     config_path = "examples/config/HGNS_PF_datakit_case14.yaml"
 
@@ -35,6 +36,28 @@ def prepare_training_config():
     return config_path
 
 
+def prepare_opf_training_config():
+    """
+    Modify the OPF training config to set epochs to 2 for testing.
+    """
+    config_path = "examples/config/HGNS_OPF_datakit_case14.yaml"
+
+    with open(config_path, "r") as f:
+        config = yaml.safe_load(f)
+
+    if "training" not in config:
+        config["training"] = {}
+
+    config["training"]["epochs"] = 2
+
+    with open(config_path, "w") as f:
+        yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+
+    print(f"OPF training config updated: epochs set to {config['training']['epochs']}")
+
+    return config_path
+
+
 @pytest.fixture
 def cleanup_test_artifacts():
     """
@@ -138,3 +161,105 @@ def test_train(cleanup_test_artifacts):
     )
 
     print(f"PBE Mean value {pbe_mean_value} is within acceptable range [1.1, 2.9]")
+
+
+@pytest.fixture
+def cleanup_opf_test_artifacts():
+    """
+    Remove generated artifacts after the OPF test.
+    """
+    yield
+
+    for d in ["data_out_opf", "logs_opf"]:
+        if os.path.exists(d):
+            shutil.rmtree(d, ignore_errors=True)
+
+
+def test_train_opf(cleanup_opf_test_artifacts):
+    """
+    Integration test for OPF data download and gridfm-graphkit OPF training.
+
+    Steps:
+    1. Download pre-generated OPF power grid data from Google Drive
+    2. Train a model using gridfm-graphkit with the OPF config
+    3. Validate OPF-specific metrics
+    """
+
+    opf_data_dir = "data_out_opf"
+
+    if not os.path.exists(opf_data_dir) or not os.listdir(opf_data_dir):
+        print("OPF data directory not found or empty, downloading pre-generated data...")
+
+        gdrive_file_id = "1Ow4SYGAYQ4mZad4yNKYmXvG24BbPs8Pj"
+        zip_filename = "case14_ieee.10000_scenarios_2_variants_opf.zip"
+        gdrive_url = f"https://drive.google.com/uc?id={gdrive_file_id}"
+
+        print(f"Downloading {zip_filename} from Google Drive...")
+        gdown.download(gdrive_url, zip_filename, quiet=False)
+
+        print(f"Extracting {zip_filename}...")
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with zipfile.ZipFile(zip_filename, "r") as zf:
+                zf.extractall(tmpdir)
+            shutil.move(os.path.join(tmpdir, "data_out"), opf_data_dir)
+
+        os.remove(zip_filename)
+        print(f"OPF data extracted to '{opf_data_dir}'.")
+    else:
+        print(f"OPF data directory '{opf_data_dir}' already exists, skipping download.")
+
+    training_config_path = prepare_opf_training_config()
+
+    execute_and_live_output(
+        f"gridfm_graphkit train "
+        f"--config {training_config_path} "
+        f"--data_path {opf_data_dir}/ "
+        f"--exp_name exp_opf "
+        f"--run_name run1 "
+        f"--log_dir logs_opf",
+    )
+
+    log_base = "logs_opf"
+
+    exp_dirs = glob.glob(os.path.join(log_base, "*"))
+    assert len(exp_dirs) > 0, "No experiment directories found in logs_opf/"
+
+    latest_exp_dir = sorted(exp_dirs, key=os.path.getctime)[-1]
+
+    run_dirs = glob.glob(os.path.join(latest_exp_dir, "*"))
+    assert len(run_dirs) > 0, f"No run directories found in {latest_exp_dir}"
+
+    latest_run_dir = max(run_dirs, key=os.path.getmtime)
+
+    metrics_file = os.path.join(
+        latest_run_dir,
+        "artifacts",
+        "test",
+        "case14_ieee_metrics.csv",
+    )
+
+    assert os.path.exists(metrics_file), f"Metrics file not found: {metrics_file}"
+
+    df = pd.read_csv(metrics_file)
+    metrics = dict(zip(df["Metric"], df["Value"].astype(float)))
+
+    checks = {
+        "Avg. active res. (MW)": (0.0, 2.0),
+        "Avg. reactive res. (MVar)": (0.0, 2.0),
+        "RMSE PG generators (MW)": (0.0, 50.0),
+        "Mean optimality gap (%)": (0.0, 10.0),
+        "Mean branch thermal violation from (MVA)": (0.0, 5.0),
+        "Mean branch thermal violation to (MVA)": (0.0, 5.0),
+        "Mean branch angle difference violation (radians)": (0.0, 1.0),
+        "Mean Qg violation PV buses": (0.0, 5.0),
+        "Mean Qg violation REF buses": (0.0, 5.0),
+        "Mean Qg violation": (0.0, 5.0),
+    }
+
+    for metric_name, (lo, hi) in checks.items():
+        assert metric_name in metrics, f"Metric '{metric_name}' not found in CSV"
+        value = metrics[metric_name]
+        assert lo <= value <= hi, (
+            f"Metric '{metric_name}' value {value} is outside acceptable range [{lo}, {hi}]"
+        )
+        print(f"{metric_name}: {value} is within [{lo}, {hi}]")

From bba356c39f1e6a139e76a8fa4b3d31ebbbcfd659 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Thu, 30 Apr 2026 10:31:42 +0200
Subject: [PATCH 05/18] fix dateset

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index e291cd17..42f17dce 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -190,7 +190,7 @@ def test_train_opf(cleanup_opf_test_artifacts):
     if not os.path.exists(opf_data_dir) or not os.listdir(opf_data_dir):
         print("OPF data directory not found or empty, downloading pre-generated data...")
 
-        gdrive_file_id = "1Ow4SYGAYQ4mZad4yNKYmXvG24BbPs8Pj"
+        gdrive_file_id = "1-pgoUqVcTfqZCOFrpMoL4VgPkHrjq3z9"
         zip_filename = "case14_ieee.10000_scenarios_2_variants_opf.zip"
         gdrive_url = f"https://drive.google.com/uc?id={gdrive_file_id}"
 

From 2856323f387cccd982f82dc14efc07b6f27b7dad Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Thu, 30 Apr 2026 10:33:59 +0200
Subject: [PATCH 06/18] fix test data generator

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/generate_test_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integrationtests/generate_test_data.py b/integrationtests/generate_test_data.py
index 913f00d5..fba2624b 100644
--- a/integrationtests/generate_test_data.py
+++ b/integrationtests/generate_test_data.py
@@ -53,7 +53,7 @@ def generate_opf_test_data(config_path: str = "integrationtests/default_opf.yaml
     and 2 topology variants.
     """
     config = _base_config()
-    config.setdefault("settings", {})["mode"] = "opf"
+    config["settings"]["mode"] = "opf"
 
     with open(config_path, "w") as f:
         yaml.dump(config, f, default_flow_style=False, sort_keys=False)
@@ -68,5 +68,5 @@ def generate_opf_test_data(config_path: str = "integrationtests/default_opf.yaml
 
 
 if __name__ == "__main__":
-    generate_pf_test_data()
+    #generate_pf_test_data()
     generate_opf_test_data()

From e57abc041953f1c3920e0625d3174d9c8e839a14 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 6 May 2026 19:34:41 +0200
Subject: [PATCH 07/18] Update Google Drive file ID for OPF data download in
 test_train_opf

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index 42f17dce..33483c2f 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -190,7 +190,7 @@ def test_train_opf(cleanup_opf_test_artifacts):
     if not os.path.exists(opf_data_dir) or not os.listdir(opf_data_dir):
         print("OPF data directory not found or empty, downloading pre-generated data...")
 
-        gdrive_file_id = "1-pgoUqVcTfqZCOFrpMoL4VgPkHrjq3z9"
+        gdrive_file_id = "1p5f5mRvmBQh8lZpIyWWbTbU42aHAIsdT"
         zip_filename = "case14_ieee.10000_scenarios_2_variants_opf.zip"
         gdrive_url = f"https://drive.google.com/uc?id={gdrive_file_id}"
 

From c2c068476d51c606322f6e06f15dbe6ddf0a77d0 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Thu, 7 May 2026 08:32:36 +0200
Subject: [PATCH 08/18] Add calibration option and enhance training tests with
 metrics collection

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/conftest.py      |  18 +++
 integrationtests/test_base_set.py | 179 +++++++++++++++++-------------
 2 files changed, 122 insertions(+), 75 deletions(-)
 create mode 100644 integrationtests/conftest.py

diff --git a/integrationtests/conftest.py b/integrationtests/conftest.py
new file mode 100644
index 00000000..6ad90815
--- /dev/null
+++ b/integrationtests/conftest.py
@@ -0,0 +1,18 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--calibrate",
+        type=int,
+        default=0,
+        help="Run training N times to collect metric mean/std for range calibration. "
+        "Skips metric range assertions. Example: pytest --calibrate 5",
+    )
+
+
+@pytest.fixture
+def calibrate_runs(request):
+    """Number of calibration runs requested via --calibrate (0 = normal test mode)."""
+    return request.config.getoption("--calibrate")
+
diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index 33483c2f..d2022d3e 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -8,15 +8,44 @@
 import zipfile
 import gdown
 import tempfile
+import statistics
 
 
 def execute_and_live_output(cmd) -> None:
     subprocess.run(cmd, text=True, shell=True, check=True)
 
 
+def collect_metrics_from_log(log_base: str, metric_keys: list) -> dict:
+    """Find the latest run's metrics CSV and return a dict of {metric: value}."""
+    exp_dirs = glob.glob(os.path.join(log_base, "*"))
+    assert len(exp_dirs) > 0, f"No experiment directories found in {log_base}/"
+    latest_exp_dir = sorted(exp_dirs, key=os.path.getctime)[-1]
+    run_dirs = glob.glob(os.path.join(latest_exp_dir, "*"))
+    assert len(run_dirs) > 0, f"No run directories found in {latest_exp_dir}"
+    latest_run_dir = max(run_dirs, key=os.path.getmtime)
+    metrics_file = os.path.join(latest_run_dir, "artifacts", "test", "case14_ieee_metrics.csv")
+    assert os.path.exists(metrics_file), f"Metrics file not found: {metrics_file}"
+    df = pd.read_csv(metrics_file)
+    return dict(zip(df["Metric"], df["Value"].astype(float)))
+
+
+def print_calibration_stats(all_runs: list, metric_keys: list) -> None:
+    """Print mean +/- std across calibration runs for each metric."""
+    print("\n===== Calibration Results =====")
+    for key in metric_keys:
+        values = [run[key] for run in all_runs if key in run]
+        if not values:
+            print(f"  {key}: no data")
+            continue
+        mean = statistics.mean(values)
+        std = statistics.stdev(values) if len(values) > 1 else 0.0
+        print(f"  {key}: mean={mean:.4f}  std={std:.4f}  min={min(values):.4f}  max={max(values):.4f}")
+    print("==============================\n")
+
+
 def prepare_training_config():
     """
-    Modify the PF training config to set epochs to 2 for testing.
+    Modify the PF training config to set epochs to 20 and hidden_size to 12 for testing.
     """
     config_path = "examples/config/HGNS_PF_datakit_case14.yaml"
 
@@ -25,20 +54,23 @@ def prepare_training_config():
 
     if "training" not in config:
         config["training"] = {}
+    if "model" not in config:
+        config["model"] = {}
 
-    config["training"]["epochs"] = 2
+    config["training"]["epochs"] = 20
+    config["model"]["hidden_size"] = 12
 
     with open(config_path, "w") as f:
         yaml.dump(config, f, default_flow_style=False, sort_keys=False)
 
-    print(f"Training config updated: epochs set to {config['training']['epochs']}")
+    print(f"Training config updated: epochs set to {config['training']['epochs']}, hidden_size set to {config['model']['hidden_size']}")
 
     return config_path
 
 
 def prepare_opf_training_config():
     """
-    Modify the OPF training config to set epochs to 2 for testing.
+    Modify the OPF training config to set epochs to 20 and hidden_size to 12 for testing.
     """
     config_path = "examples/config/HGNS_OPF_datakit_case14.yaml"
 
@@ -47,13 +79,16 @@ def prepare_opf_training_config():
 
     if "training" not in config:
         config["training"] = {}
+    if "model" not in config:
+        config["model"] = {}
 
-    config["training"]["epochs"] = 2
+    config["training"]["epochs"] = 20
+    config["model"]["hidden_size"] = 12
 
     with open(config_path, "w") as f:
         yaml.dump(config, f, default_flow_style=False, sort_keys=False)
 
-    print(f"OPF training config updated: epochs set to {config['training']['epochs']}")
+    print(f"OPF training config updated: epochs set to {config['training']['epochs']}, hidden_size set to {config['model']['hidden_size']}")
 
     return config_path
 
@@ -86,7 +121,7 @@ def cleanup_test_artifacts():
             shutil.rmtree(d, ignore_errors=True)
 
 
-def test_train(cleanup_test_artifacts):
+def test_train(cleanup_test_artifacts, calibrate_runs):
     """
     Integration test for gridfm-datakit data generation and gridfm-graphkit training.
 
@@ -94,8 +129,14 @@ def test_train(cleanup_test_artifacts):
     1. Generate power grid data using gridfm-datakit
     2. Train a model using gridfm-graphkit
     3. Validate the PBE Mean metric
+
+    Pass --calibrate N to pytest (e.g. pytest --calibrate 5) to run N training passes
+    and print metric mean/std without asserting range bounds.
     """
 
+    n_runs = max(calibrate_runs, 1)
+    pf_metric_keys = ["PBE Mean"]
+
     data_dir = "data_out"
 
     if not os.path.exists(data_dir) or not os.listdir(data_dir):
@@ -118,43 +159,27 @@ def test_train(cleanup_test_artifacts):
         print(f"Data directory '{data_dir}' already exists, skipping download.")
 
     training_config_path = prepare_training_config()
+    all_runs = []
+
+    for run_i in range(n_runs):
+        print(f"\n--- PF Training run {run_i + 1}/{n_runs} ---")
+        execute_and_live_output(
+            f"gridfm_graphkit train "
+            f"--config {training_config_path} "
+            f"--data_path data_out/ "
+            f"--exp_name exp1 "
+            f"--run_name run{run_i + 1} "
+            f"--log_dir logs",
+        )
+        metrics = collect_metrics_from_log("logs", pf_metric_keys)
+        all_runs.append(metrics)
 
-    execute_and_live_output(
-        f"gridfm_graphkit train "
-        f"--config {training_config_path} "
-        f"--data_path data_out/ "
-        f"--exp_name exp1 "
-        f"--run_name run1 "
-        f"--log_dir logs",
-    )
-
-    log_base = "logs"
-
-    exp_dirs = glob.glob(os.path.join(log_base, "*"))
-    assert len(exp_dirs) > 0, "No experiment directories found in logs/"
-
-    latest_exp_dir = sorted(exp_dirs, key=os.path.getctime)[-1]
-
-    run_dirs = glob.glob(os.path.join(latest_exp_dir, "*"))
-    assert len(run_dirs) > 0, f"No run directories found in {latest_exp_dir}"
-
-    latest_run_dir = max(run_dirs, key=os.path.getmtime)
-
-    metrics_file = os.path.join(
-        latest_run_dir,
-        "artifacts",
-        "test",
-        "case14_ieee_metrics.csv",
-    )
-
-    assert os.path.exists(metrics_file), f"Metrics file not found: {metrics_file}"
-
-    df = pd.read_csv(metrics_file)
-
-    pbe_mean_row = df[df["Metric"] == "PBE Mean"]
-    assert len(pbe_mean_row) > 0, "PBE Mean metric not found in CSV"
+    if calibrate_runs > 0:
+        print_calibration_stats(all_runs, pf_metric_keys)
+        return
 
-    pbe_mean_value = float(pbe_mean_row.iloc[0]["Value"])
+    metrics = all_runs[0]
+    pbe_mean_value = metrics["PBE Mean"]
 
     assert 1.1 <= pbe_mean_value <= 2.9, (
         f"PBE Mean value {pbe_mean_value} is outside acceptable range [1.1, 2.9]"
@@ -175,7 +200,7 @@ def cleanup_opf_test_artifacts():
             shutil.rmtree(d, ignore_errors=True)
 
 
-def test_train_opf(cleanup_opf_test_artifacts):
+def test_train_opf(cleanup_opf_test_artifacts, calibrate_runs):
     """
     Integration test for OPF data download and gridfm-graphkit OPF training.
 
@@ -183,14 +208,31 @@ def test_train_opf(cleanup_opf_test_artifacts):
     1. Download pre-generated OPF power grid data from Google Drive
     2. Train a model using gridfm-graphkit with the OPF config
     3. Validate OPF-specific metrics
+
+    Pass --calibrate N to pytest (e.g. pytest --calibrate 5) to run N training passes
+    and print metric mean/std without asserting range bounds.
     """
 
+    n_runs = max(calibrate_runs, 1)
+    opf_metric_keys = [
+        "Avg. active res. (MW)",
+        "Avg. reactive res. (MVar)",
+        "RMSE PG generators (MW)",
+        "Mean optimality gap (%)",
+        "Mean branch thermal violation from (MVA)",
+        "Mean branch thermal violation to (MVA)",
+        "Mean branch angle difference violation (radians)",
+        "Mean Qg violation PV buses",
+        "Mean Qg violation REF buses",
+        "Mean Qg violation",
+    ]
+
     opf_data_dir = "data_out_opf"
 
     if not os.path.exists(opf_data_dir) or not os.listdir(opf_data_dir):
         print("OPF data directory not found or empty, downloading pre-generated data...")
 
-        gdrive_file_id = "1p5f5mRvmBQh8lZpIyWWbTbU42aHAIsdT"
+        gdrive_file_id = "1p5f5mRvmBQh8lZpIyWWbTbU42aHAIsdT"  # pragma: allowlist secret
         zip_filename = "case14_ieee.10000_scenarios_2_variants_opf.zip"
         gdrive_url = f"https://drive.google.com/uc?id={gdrive_file_id}"
 
@@ -209,39 +251,26 @@ def test_train_opf(cleanup_opf_test_artifacts):
         print(f"OPF data directory '{opf_data_dir}' already exists, skipping download.")
 
     training_config_path = prepare_opf_training_config()
+    all_runs = []
+
+    for run_i in range(n_runs):
+        print(f"\n--- OPF Training run {run_i + 1}/{n_runs} ---")
+        execute_and_live_output(
+            f"gridfm_graphkit train "
+            f"--config {training_config_path} "
+            f"--data_path {opf_data_dir}/ "
+            f"--exp_name exp_opf "
+            f"--run_name run{run_i + 1} "
+            f"--log_dir logs_opf",
+        )
+        metrics = collect_metrics_from_log("logs_opf", opf_metric_keys)
+        all_runs.append(metrics)
 
-    execute_and_live_output(
-        f"gridfm_graphkit train "
-        f"--config {training_config_path} "
-        f"--data_path {opf_data_dir}/ "
-        f"--exp_name exp_opf "
-        f"--run_name run1 "
-        f"--log_dir logs_opf",
-    )
-
-    log_base = "logs_opf"
-
-    exp_dirs = glob.glob(os.path.join(log_base, "*"))
-    assert len(exp_dirs) > 0, "No experiment directories found in logs_opf/"
-
-    latest_exp_dir = sorted(exp_dirs, key=os.path.getctime)[-1]
-
-    run_dirs = glob.glob(os.path.join(latest_exp_dir, "*"))
-    assert len(run_dirs) > 0, f"No run directories found in {latest_exp_dir}"
-
-    latest_run_dir = max(run_dirs, key=os.path.getmtime)
-
-    metrics_file = os.path.join(
-        latest_run_dir,
-        "artifacts",
-        "test",
-        "case14_ieee_metrics.csv",
-    )
-
-    assert os.path.exists(metrics_file), f"Metrics file not found: {metrics_file}"
+    if calibrate_runs > 0:
+        print_calibration_stats(all_runs, opf_metric_keys)
+        return
 
-    df = pd.read_csv(metrics_file)
-    metrics = dict(zip(df["Metric"], df["Value"].astype(float)))
+    metrics = all_runs[0]
 
     checks = {
         "Avg. active res. (MW)": (0.0, 2.0),

From 2b45d8b09b8057ea9850c5ab6bcdb58352b11c1e Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 8 May 2026 10:59:36 +0200
Subject: [PATCH 09/18] Refactor calibration stats printing to include
 confidence intervals and use numpy for calculations

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 32 +++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index d2022d3e..c335eace 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -8,7 +8,8 @@
 import zipfile
 import gdown
 import tempfile
-import statistics
+import numpy as np
+from scipy import stats
 
 
 def execute_and_live_output(cmd) -> None:
@@ -30,17 +31,32 @@ def collect_metrics_from_log(log_base: str, metric_keys: list) -> dict:
 
 
 def print_calibration_stats(all_runs: list, metric_keys: list) -> None:
-    """Print mean +/- std across calibration runs for each metric."""
-    print("\n===== Calibration Results =====")
+    """
+    Print per-metric stats across calibration runs:
+      - std with Bessel's correction (ddof=1)
+      - two-sided 95% CI using Student-t distribution (t_{0.975, n-1})
+    """
+    n = len(all_runs)
+    t_crit = stats.t.ppf(0.975, df=max(n - 1, 1))  # t_{0.975, n-1}
+    col_w = max(len(k) for k in metric_keys) + 2
+    header = f"  {'Metric':<{col_w}}  {'Mean':>10}  {'Std(ddof=1)':>12}  {'CI 95% lo':>10}  {'CI 95% hi':>10}"
+    print(f"\n===== Calibration Results (n={n}, t_crit={t_crit:.4f}) =====")
+    print(header)
+    print("  " + "-" * (len(header) - 2))
     for key in metric_keys:
         values = [run[key] for run in all_runs if key in run]
         if not values:
-            print(f"  {key}: no data")
+            print(f"  {key:<{col_w}}  {'no data':>10}")
             continue
-        mean = statistics.mean(values)
-        std = statistics.stdev(values) if len(values) > 1 else 0.0
-        print(f"  {key}: mean={mean:.4f}  std={std:.4f}  min={min(values):.4f}  max={max(values):.4f}")
-    print("==============================\n")
+        arr = np.array(values, dtype=float)
+        mean = float(np.mean(arr))
+        std = float(np.std(arr, ddof=1)) if len(arr) > 1 else 0.0
+        me = t_crit * std / np.sqrt(len(arr))  # margin of error
+        lo, hi = mean - me, mean + me
+        print(
+            f"  {key:<{col_w}}  {mean:>10.4f}  {std:>12.4f}  {lo:>10.4f}  {hi:>10.4f}"
+        )
+    print("=" * (len(header)) + "\n")
 
 
 def prepare_training_config():

From c2a5255f4d79b8d9e889fe5e1a9baccf9bf55132 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 8 May 2026 15:08:11 +0200
Subject: [PATCH 10/18] Update test assertions to reflect 95% confidence
 intervals for PBE Mean and other metrics

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index c335eace..97fd8706 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -197,11 +197,11 @@ def test_train(cleanup_test_artifacts, calibrate_runs):
     metrics = all_runs[0]
     pbe_mean_value = metrics["PBE Mean"]
 
-    assert 1.1 <= pbe_mean_value <= 2.9, (
-        f"PBE Mean value {pbe_mean_value} is outside acceptable range [1.1, 2.9]"
+    assert 0.2239 <= pbe_mean_value <= 0.6535, (
+        f"PBE Mean value {pbe_mean_value} is outside 95% CI [0.2239, 0.6535]"
     )
 
-    print(f"PBE Mean value {pbe_mean_value} is within acceptable range [1.1, 2.9]")
+    print(f"PBE Mean value {pbe_mean_value} is within 95% CI [0.2239, 0.6535]")
 
 
 @pytest.fixture
@@ -289,22 +289,22 @@ def test_train_opf(cleanup_opf_test_artifacts, calibrate_runs):
     metrics = all_runs[0]
 
     checks = {
-        "Avg. active res. (MW)": (0.0, 2.0),
-        "Avg. reactive res. (MVar)": (0.0, 2.0),
-        "RMSE PG generators (MW)": (0.0, 50.0),
-        "Mean optimality gap (%)": (0.0, 10.0),
-        "Mean branch thermal violation from (MVA)": (0.0, 5.0),
-        "Mean branch thermal violation to (MVA)": (0.0, 5.0),
-        "Mean branch angle difference violation (radians)": (0.0, 1.0),
-        "Mean Qg violation PV buses": (0.0, 5.0),
-        "Mean Qg violation REF buses": (0.0, 5.0),
-        "Mean Qg violation": (0.0, 5.0),
+        "Avg. active res. (MW)": (0.2652, 0.4217),
+        "Avg. reactive res. (MVar)": (0.1005, 0.1827),
+        "RMSE PG generators (MW)": (2.6918, 3.1753),
+        "Mean optimality gap (%)": (1.2375, 1.5709),
+        "Mean branch thermal violation from (MVA)": (0.0, 0.0),
+        "Mean branch thermal violation to (MVA)": (0.0, 0.0),
+        "Mean branch angle difference violation (radians)": (0.0, 0.0),
+        "Mean Qg violation PV buses": (-0.0008, 0.2543),
+        "Mean Qg violation REF buses": (0.0245, 0.4099),
+        "Mean Qg violation": (0.0108, 0.2801),
     }
 
     for metric_name, (lo, hi) in checks.items():
         assert metric_name in metrics, f"Metric '{metric_name}' not found in CSV"
         value = metrics[metric_name]
         assert lo <= value <= hi, (
-            f"Metric '{metric_name}' value {value} is outside acceptable range [{lo}, {hi}]"
+            f"Metric '{metric_name}' value {value} is outside 95% CI [{lo}, {hi}]"
         )
-        print(f"{metric_name}: {value} is within [{lo}, {hi}]")
+        print(f"{metric_name}: {value} is within 95% CI [{lo}, {hi}]")

From 68f49600ff421e1dfae4b43384737aec5b66edac Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 8 May 2026 23:18:32 +0200
Subject: [PATCH 11/18] Enhance calibration functionality by adding confidence
 interval option and updating related tests

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/conftest.py      | 13 +++++++++++++
 integrationtests/test_base_set.py | 27 ++++++++++++++++++---------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/integrationtests/conftest.py b/integrationtests/conftest.py
index 6ad90815..5ec50984 100644
--- a/integrationtests/conftest.py
+++ b/integrationtests/conftest.py
@@ -9,6 +9,13 @@ def pytest_addoption(parser):
         help="Run training N times to collect metric mean/std for range calibration. "
         "Skips metric range assertions. Example: pytest --calibrate 5",
     )
+    parser.addoption(
+        "--ci",
+        type=float,
+        default=0.95,
+        help="Confidence interval level for calibration stats (default 0.95). "
+        "Example: pytest --calibrate 5 -s --ci 0.995",
+    )
 
 
 @pytest.fixture
@@ -16,3 +23,9 @@ def calibrate_runs(request):
     """Number of calibration runs requested via --calibrate (0 = normal test mode)."""
     return request.config.getoption("--calibrate")
 
+
+@pytest.fixture
+def ci_level(request):
+    """Confidence interval level requested via --ci (default 0.95)."""
+    return request.config.getoption("--ci")
+
diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index 97fd8706..134d043d 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -30,17 +30,26 @@ def collect_metrics_from_log(log_base: str, metric_keys: list) -> dict:
     return dict(zip(df["Metric"], df["Value"].astype(float)))
 
 
-def print_calibration_stats(all_runs: list, metric_keys: list) -> None:
+def print_calibration_stats(all_runs: list, metric_keys: list, confidence_interval: float = 0.95) -> None:
     """
     Print per-metric stats across calibration runs:
       - std with Bessel's correction (ddof=1)
-      - two-sided 95% CI using Student-t distribution (t_{0.975, n-1})
+      - two-sided CI using Student-t distribution
+
+    Args:
+        all_runs: list of per-run metric dicts
+        metric_keys: list of metric names to report
+        confidence_interval: desired confidence level (default 0.95).
+            Example with higher confidence:
+                print_calibration_stats(all_runs, metric_keys, confidence_interval=0.995)
     """
     n = len(all_runs)
-    t_crit = stats.t.ppf(0.975, df=max(n - 1, 1))  # t_{0.975, n-1}
+    alpha_half = (1 + confidence_interval) / 2
+    t_crit = stats.t.ppf(alpha_half, df=max(n - 1, 1))
+    ci_pct = int(confidence_interval * 100)
     col_w = max(len(k) for k in metric_keys) + 2
-    header = f"  {'Metric':<{col_w}}  {'Mean':>10}  {'Std(ddof=1)':>12}  {'CI 95% lo':>10}  {'CI 95% hi':>10}"
-    print(f"\n===== Calibration Results (n={n}, t_crit={t_crit:.4f}) =====")
+    header = f"  {'Metric':<{col_w}}  {'Mean':>10}  {'Std(ddof=1)':>12}  {f'CI {ci_pct}% lo':>10}  {f'CI {ci_pct}% hi':>10}"
+    print(f"\n===== Calibration Results (n={n}, CI={confidence_interval}, t_crit={t_crit:.4f}) =====")
     print(header)
     print("  " + "-" * (len(header) - 2))
     for key in metric_keys:
@@ -137,7 +146,7 @@ def cleanup_test_artifacts():
             shutil.rmtree(d, ignore_errors=True)
 
 
-def test_train(cleanup_test_artifacts, calibrate_runs):
+def test_train(cleanup_test_artifacts, calibrate_runs, ci_level):
     """
     Integration test for gridfm-datakit data generation and gridfm-graphkit training.
 
@@ -191,7 +200,7 @@ def test_train(cleanup_test_artifacts, calibrate_runs):
         all_runs.append(metrics)
 
     if calibrate_runs > 0:
-        print_calibration_stats(all_runs, pf_metric_keys)
+        print_calibration_stats(all_runs, pf_metric_keys, confidence_interval=ci_level)
         return
 
     metrics = all_runs[0]
@@ -216,7 +225,7 @@ def cleanup_opf_test_artifacts():
             shutil.rmtree(d, ignore_errors=True)
 
 
-def test_train_opf(cleanup_opf_test_artifacts, calibrate_runs):
+def test_train_opf(cleanup_opf_test_artifacts, calibrate_runs, ci_level):
     """
     Integration test for OPF data download and gridfm-graphkit OPF training.
 
@@ -283,7 +292,7 @@ def test_train_opf(cleanup_opf_test_artifacts, calibrate_runs):
         all_runs.append(metrics)
 
     if calibrate_runs > 0:
-        print_calibration_stats(all_runs, opf_metric_keys)
+        print_calibration_stats(all_runs, opf_metric_keys, confidence_interval=ci_level)
         return
 
     metrics = all_runs[0]

From e55cac90a7242bb21a403ad3e1315f2d371e2cfc Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Sat, 9 May 2026 09:10:53 +0200
Subject: [PATCH 12/18] Update calibration stats and tests to reflect 99.5%
 confidence intervals

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index 134d043d..53af2b6f 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -46,7 +46,7 @@ def print_calibration_stats(all_runs: list, metric_keys: list, confidence_interv
     n = len(all_runs)
     alpha_half = (1 + confidence_interval) / 2
     t_crit = stats.t.ppf(alpha_half, df=max(n - 1, 1))
-    ci_pct = int(confidence_interval * 100)
+    ci_pct = f"{confidence_interval * 100:g}"
     col_w = max(len(k) for k in metric_keys) + 2
     header = f"  {'Metric':<{col_w}}  {'Mean':>10}  {'Std(ddof=1)':>12}  {f'CI {ci_pct}% lo':>10}  {f'CI {ci_pct}% hi':>10}"
     print(f"\n===== Calibration Results (n={n}, CI={confidence_interval}, t_crit={t_crit:.4f}) =====")
@@ -206,11 +206,11 @@ def test_train(cleanup_test_artifacts, calibrate_runs, ci_level):
     metrics = all_runs[0]
     pbe_mean_value = metrics["PBE Mean"]
 
-    assert 0.2239 <= pbe_mean_value <= 0.6535, (
-        f"PBE Mean value {pbe_mean_value} is outside 95% CI [0.2239, 0.6535]"
+    assert -0.0171 <= pbe_mean_value <= 0.8610, (
+        f"PBE Mean value {pbe_mean_value} is outside 99.5% CI [-0.0171, 0.8610]"
     )
 
-    print(f"PBE Mean value {pbe_mean_value} is within 95% CI [0.2239, 0.6535]")
+    print(f"PBE Mean value {pbe_mean_value} is within 99.5% CI [-0.0171, 0.8610]")
 
 
 @pytest.fixture
@@ -298,22 +298,22 @@ def test_train_opf(cleanup_opf_test_artifacts, calibrate_runs, ci_level):
     metrics = all_runs[0]
 
     checks = {
-        "Avg. active res. (MW)": (0.2652, 0.4217),
-        "Avg. reactive res. (MVar)": (0.1005, 0.1827),
-        "RMSE PG generators (MW)": (2.6918, 3.1753),
-        "Mean optimality gap (%)": (1.2375, 1.5709),
+        "Avg. active res. (MW)": (0.2067, 0.4619),
+        "Avg. reactive res. (MVar)": (0.0825, 0.1492),
+        "RMSE PG generators (MW)": (2.6480, 2.8693),
+        "Mean optimality gap (%)": (1.1039, 1.4934),
         "Mean branch thermal violation from (MVA)": (0.0, 0.0),
         "Mean branch thermal violation to (MVA)": (0.0, 0.0),
         "Mean branch angle difference violation (radians)": (0.0, 0.0),
-        "Mean Qg violation PV buses": (-0.0008, 0.2543),
-        "Mean Qg violation REF buses": (0.0245, 0.4099),
-        "Mean Qg violation": (0.0108, 0.2801),
+        "Mean Qg violation PV buses": (0.0167, 0.1546),
+        "Mean Qg violation REF buses": (-0.0693, 0.4241),
+        "Mean Qg violation": (0.0771, 0.1322),
     }
 
     for metric_name, (lo, hi) in checks.items():
         assert metric_name in metrics, f"Metric '{metric_name}' not found in CSV"
         value = metrics[metric_name]
         assert lo <= value <= hi, (
-            f"Metric '{metric_name}' value {value} is outside 95% CI [{lo}, {hi}]"
+            f"Metric '{metric_name}' value {value} is outside 99.5% CI [{lo}, {hi}]"
         )
-        print(f"{metric_name}: {value} is within 95% CI [{lo}, {hi}]")
+        print(f"{metric_name}: {value} is within 99.5% CI [{lo}, {hi}]")

From 143f81a5bae0e65200a278ea0160a9241b7d9814 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Mon, 11 May 2026 10:45:41 +0200
Subject: [PATCH 13/18] Implement retry logic for metric validation in training
 tests

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 81 ++++++++++++++++++++++++-------
 1 file changed, 64 insertions(+), 17 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index 53af2b6f..ad238ec5 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -203,14 +203,37 @@ def test_train(cleanup_test_artifacts, calibrate_runs, ci_level):
         print_calibration_stats(all_runs, pf_metric_keys, confidence_interval=ci_level)
         return
 
-    metrics = all_runs[0]
-    pbe_mean_value = metrics["PBE Mean"]
-
-    assert -0.0171 <= pbe_mean_value <= 0.8610, (
-        f"PBE Mean value {pbe_mean_value} is outside 99.5% CI [-0.0171, 0.8610]"
-    )
-
-    print(f"PBE Mean value {pbe_mean_value} is within 99.5% CI [-0.0171, 0.8610]")
+    MAX_RETRIES = 5
+    last_error = None
+    for attempt in range(1, MAX_RETRIES + 1):
+        if attempt > 1:
+            print(f"\n--- PF Retry attempt {attempt}/{MAX_RETRIES} after metric interval failure ---")
+            execute_and_live_output(
+                f"gridfm_graphkit train "
+                f"--config {training_config_path} "
+                f"--data_path data_out/ "
+                f"--exp_name exp1 "
+                f"--run_name retry{attempt} "
+                f"--log_dir logs",
+            )
+            metrics = collect_metrics_from_log("logs", pf_metric_keys)
+        else:
+            metrics = all_runs[0]
+
+        pbe_mean_value = metrics["PBE Mean"]
+        try:
+            assert -0.0171 <= pbe_mean_value <= 0.8610, (
+                f"PBE Mean value {pbe_mean_value} is outside 99.5% CI [-0.0171, 0.8610]"
+            )
+            print(f"PBE Mean value {pbe_mean_value} is within 99.5% CI [-0.0171, 0.8610] (attempt {attempt})")
+            last_error = None
+            break
+        except AssertionError as e:
+            print(f"Attempt {attempt}/{MAX_RETRIES} failed: {e}")
+            last_error = e
+
+    if last_error is not None:
+        raise last_error
 
 
 @pytest.fixture
@@ -295,8 +318,6 @@ def test_train_opf(cleanup_opf_test_artifacts, calibrate_runs, ci_level):
         print_calibration_stats(all_runs, opf_metric_keys, confidence_interval=ci_level)
         return
 
-    metrics = all_runs[0]
-
     checks = {
         "Avg. active res. (MW)": (0.2067, 0.4619),
         "Avg. reactive res. (MVar)": (0.0825, 0.1492),
@@ -310,10 +331,36 @@ def test_train_opf(cleanup_opf_test_artifacts, calibrate_runs, ci_level):
         "Mean Qg violation": (0.0771, 0.1322),
     }
 
-    for metric_name, (lo, hi) in checks.items():
-        assert metric_name in metrics, f"Metric '{metric_name}' not found in CSV"
-        value = metrics[metric_name]
-        assert lo <= value <= hi, (
-            f"Metric '{metric_name}' value {value} is outside 99.5% CI [{lo}, {hi}]"
-        )
-        print(f"{metric_name}: {value} is within 99.5% CI [{lo}, {hi}]")
+    MAX_RETRIES = 5
+    last_error = None
+    for attempt in range(1, MAX_RETRIES + 1):
+        if attempt > 1:
+            print(f"\n--- OPF Retry attempt {attempt}/{MAX_RETRIES} after metric interval failure ---")
+            execute_and_live_output(
+                f"gridfm_graphkit train "
+                f"--config {training_config_path} "
+                f"--data_path {opf_data_dir}/ "
+                f"--exp_name exp_opf "
+                f"--run_name retry{attempt} "
+                f"--log_dir logs_opf",
+            )
+            metrics = collect_metrics_from_log("logs_opf", opf_metric_keys)
+        else:
+            metrics = all_runs[0]
+
+        try:
+            for metric_name, (lo, hi) in checks.items():
+                assert metric_name in metrics, f"Metric '{metric_name}' not found in CSV"
+                value = metrics[metric_name]
+                assert lo <= value <= hi, (
+                    f"Metric '{metric_name}' value {value} is outside 99.5% CI [{lo}, {hi}]"
+                )
+                print(f"{metric_name}: {value} is within 99.5% CI [{lo}, {hi}] (attempt {attempt})")
+            last_error = None
+            break
+        except AssertionError as e:
+            print(f"Attempt {attempt}/{MAX_RETRIES} failed: {e}")
+            last_error = e
+
+    if last_error is not None:
+        raise last_error

From 20e75dee732062708d6b8b357e64dc3d79204aac Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 15 May 2026 09:18:11 +0200
Subject: [PATCH 14/18] Add note about `torch-scatter` as a required dependency
 in installation instructions

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 README.md                    | 2 ++
 docs/install/installation.md | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 753fdb7e..acd216b6 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,8 @@ Install gridfm-graphkit in editable mode
 pip install -e .
 ```
 
+**`torch-scatter` is a required dependency.** It cannot be bundled in `pyproject.toml` because the correct wheel depends on your PyTorch and CUDA versions, so it must be installed separately.
+
 Get PyTorch + CUDA version for torch-scatter
 ```bash
 TORCH_CUDA_VERSION=$(python -c "import torch; print(torch.__version__ + ('+cpu' if torch.version.cuda is None else ''))")
diff --git a/docs/install/installation.md b/docs/install/installation.md
index c65ab752..345091a9 100644
--- a/docs/install/installation.md
+++ b/docs/install/installation.md
@@ -15,6 +15,8 @@ Install gridfm-graphkit in editable mode
 pip install -e .
 ```
 
+**`torch-scatter` is a required dependency.** It cannot be bundled in `pyproject.toml` because the correct wheel depends on your PyTorch and CUDA versions, so it must be installed separately.
+
 Get PyTorch + CUDA version for torch-scatter
 
 ```bash

From d1a795baa20222113a9c5e64f9f03bb7d9130e47 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Mon, 18 May 2026 23:55:58 +0200
Subject: [PATCH 15/18] Update default confidence interval to 0.995 in
 calibration options and related functions

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/conftest.py      | 6 +++---
 integrationtests/test_base_set.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/integrationtests/conftest.py b/integrationtests/conftest.py
index 5ec50984..01737f5e 100644
--- a/integrationtests/conftest.py
+++ b/integrationtests/conftest.py
@@ -12,8 +12,8 @@ def pytest_addoption(parser):
     parser.addoption(
         "--ci",
         type=float,
-        default=0.95,
-        help="Confidence interval level for calibration stats (default 0.95). "
+        default=0.995,
+        help="Confidence interval level for calibration stats (default 0.995). "
         "Example: pytest --calibrate 5 -s --ci 0.995",
     )
 
@@ -26,6 +26,6 @@ def calibrate_runs(request):
 
 @pytest.fixture
 def ci_level(request):
-    """Confidence interval level requested via --ci (default 0.95)."""
+    """Confidence interval level requested via --ci (default 0.995)."""
     return request.config.getoption("--ci")
 
diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index ad238ec5..dcdf543c 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -30,7 +30,7 @@ def collect_metrics_from_log(log_base: str, metric_keys: list) -> dict:
     return dict(zip(df["Metric"], df["Value"].astype(float)))
 
 
-def print_calibration_stats(all_runs: list, metric_keys: list, confidence_interval: float = 0.95) -> None:
+def print_calibration_stats(all_runs: list, metric_keys: list, confidence_interval: float = 0.995) -> None:
     """
     Print per-metric stats across calibration runs:
       - std with Bessel's correction (ddof=1)
@@ -39,7 +39,7 @@ def print_calibration_stats(all_runs: list, metric_keys: list, confidence_interv
     Args:
         all_runs: list of per-run metric dicts
         metric_keys: list of metric names to report
-        confidence_interval: desired confidence level (default 0.95).
+        confidence_interval: desired confidence level (default 0.995).
             Example with higher confidence:
                 print_calibration_stats(all_runs, metric_keys, confidence_interval=0.995)
     """

From 2021827c21fb350e304aaf9b2c769713e8ae708d Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Mon, 18 May 2026 23:57:22 +0200
Subject: [PATCH 16/18] Rename test function to clarify focus on power flow
 integration and update description for accuracy

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index dcdf543c..a1614b99 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -146,13 +146,13 @@ def cleanup_test_artifacts():
             shutil.rmtree(d, ignore_errors=True)
 
 
-def test_train(cleanup_test_artifacts, calibrate_runs, ci_level):
+def test_train_pf(cleanup_test_artifacts, calibrate_runs, ci_level):
     """
-    Integration test for gridfm-datakit data generation and gridfm-graphkit training.
+    Integration test for power flow (PF): gridfm-datakit data generation and gridfm-graphkit training.
 
     Steps:
-    1. Generate power grid data using gridfm-datakit
-    2. Train a model using gridfm-graphkit
+    1. Generate power flow grid data using gridfm-datakit
+    2. Train a PF model using gridfm-graphkit
     3. Validate the PBE Mean metric
 
     Pass --calibrate N to pytest (e.g. pytest --calibrate 5) to run N training passes

From 1176d990c9aa3d6ba332a5a35473100ae66c9995 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Tue, 19 May 2026 00:02:02 +0200
Subject: [PATCH 17/18] Update test assertions to reflect 95% confidence
 interval for PBE Mean value

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index a1614b99..f03ca420 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -223,9 +223,9 @@ def test_train_pf(cleanup_test_artifacts, calibrate_runs, ci_level):
         pbe_mean_value = metrics["PBE Mean"]
         try:
             assert -0.0171 <= pbe_mean_value <= 0.8610, (
-                f"PBE Mean value {pbe_mean_value} is outside 99.5% CI [-0.0171, 0.8610]"
+                f"PBE Mean value {pbe_mean_value} is outside 95% CI [-0.0171, 0.8610]"
             )
-            print(f"PBE Mean value {pbe_mean_value} is within 99.5% CI [-0.0171, 0.8610] (attempt {attempt})")
+            print(f"PBE Mean value {pbe_mean_value} is within 95% CI [-0.0171, 0.8610] (attempt {attempt})")
             last_error = None
             break
         except AssertionError as e:

From df99154cf2bce13cba447a87b262e2a9cb59460d Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Tue, 19 May 2026 00:05:44 +0200
Subject: [PATCH 18/18] Update PBE Mean value assertions to reflect new 95%
 confidence interval

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 integrationtests/test_base_set.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
index f03ca420..2dfe4d8f 100644
--- a/integrationtests/test_base_set.py
+++ b/integrationtests/test_base_set.py
@@ -222,10 +222,10 @@ def test_train_pf(cleanup_test_artifacts, calibrate_runs, ci_level):
 
         pbe_mean_value = metrics["PBE Mean"]
         try:
-            assert -0.0171 <= pbe_mean_value <= 0.8610, (
-                f"PBE Mean value {pbe_mean_value} is outside 95% CI [-0.0171, 0.8610]"
+            assert 0.2042 <= pbe_mean_value <= 0.6397, (
+                f"PBE Mean value {pbe_mean_value} is outside 95% CI [0.2042, 0.6397]"
             )
-            print(f"PBE Mean value {pbe_mean_value} is within 95% CI [-0.0171, 0.8610] (attempt {attempt})")
+            print(f"PBE Mean value {pbe_mean_value} is within 95% CI [0.2042, 0.6397] (attempt {attempt})")
             last_error = None
             break
         except AssertionError as e: