From c0a09cb5c8fab288ddaaa309541f488a538e3c2e Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 4 Jun 2025 16:41:10 -0500 Subject: [PATCH 01/85] qBraid integration MVP (#4) * working implementation using openQasm * modified and added test files(incomplete) * fix emulate command alignment * update polling + format * update polling interval and make code more readable * remove ionq fields from target-arguments * fix formatting * Add qBraid mock python server for testing Signed-off-by: Ryan Hill * Update __init__.py Signed-off-by: Ryan Hill * QbraidTester running correctly * added documentation for qbraid --------- Signed-off-by: Ryan Hill Co-authored-by: feelerx --- .github/workflows/integration_tests.yml | 5 + docs/sphinx/targets/cpp/qbraid.cpp | 49 +++ docs/sphinx/targets/python/qbraid.py | 52 +++ docs/sphinx/using/backends/cloud.rst | 6 + docs/sphinx/using/backends/cloud/qbraid.rst | 62 ++++ .../using/backends/hardware/iontrap.rst | 68 ++++ lib/Optimizer/CodeGen/Passes.cpp | 14 + .../default/rest/helpers/CMakeLists.txt | 3 + .../rest/helpers/qbraid/CMakeLists.txt | 17 + .../helpers/qbraid/QbraidServerHelper.cpp | 295 ++++++++++++++++++ .../default/rest/helpers/qbraid/qbraid.yml | 30 ++ targettests/execution/cudaq_observe-cpp17.cpp | 56 ++++ targettests/qbraid/bug_qubit.cpp | 50 +++ targettests/qbraid/callable_kernel_arg.cpp | 50 +++ targettests/qbraid/cudaq_observe.cpp | 57 ++++ targettests/qbraid/if_jit.cpp | 45 +++ targettests/qbraid/load_value.cpp | 63 ++++ targettests/qbraid/sudoku_2x2-1.cpp | 79 +++++ targettests/qbraid/sudoku_2x2-bit_names.cpp | 103 ++++++ targettests/qbraid/sudoku_2x2-reg_name.cpp | 79 +++++ targettests/qbraid/sudoku_2x2.cpp | 78 +++++ targettests/qbraid/swap_gate.cpp | 43 +++ targettests/qbraid/test-int8_t.cpp | 48 +++ targettests/qbraid/test-int8_t_free_func.cpp | 46 +++ targettests/qbraid/variable_size_qreg.cpp | 46 +++ tpls/Stim | 2 +- tpls/cpr | 2 +- tpls/fmt | 2 +- tpls/spdlog | 2 +- unittests/backends/CMakeLists.txt | 13 +- 
unittests/backends/qbraid/CMakeLists.txt | 27 ++ .../qbraid/QbraidStartServerAndTest.sh.in | 43 +++ unittests/backends/qbraid/QbraidTester.cpp | 177 +++++++++++ utils/mock_qpu/__init__.py | 1 + utils/mock_qpu/qbraid/__init__.py | 240 ++++++++++++++ 35 files changed, 1944 insertions(+), 9 deletions(-) create mode 100644 docs/sphinx/targets/cpp/qbraid.cpp create mode 100644 docs/sphinx/targets/python/qbraid.py create mode 100644 docs/sphinx/using/backends/cloud/qbraid.rst create mode 100644 runtime/cudaq/platform/default/rest/helpers/qbraid/CMakeLists.txt create mode 100644 runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp create mode 100644 runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml create mode 100644 targettests/execution/cudaq_observe-cpp17.cpp create mode 100644 targettests/qbraid/bug_qubit.cpp create mode 100644 targettests/qbraid/callable_kernel_arg.cpp create mode 100644 targettests/qbraid/cudaq_observe.cpp create mode 100644 targettests/qbraid/if_jit.cpp create mode 100644 targettests/qbraid/load_value.cpp create mode 100644 targettests/qbraid/sudoku_2x2-1.cpp create mode 100644 targettests/qbraid/sudoku_2x2-bit_names.cpp create mode 100644 targettests/qbraid/sudoku_2x2-reg_name.cpp create mode 100644 targettests/qbraid/sudoku_2x2.cpp create mode 100644 targettests/qbraid/swap_gate.cpp create mode 100644 targettests/qbraid/test-int8_t.cpp create mode 100644 targettests/qbraid/test-int8_t_free_func.cpp create mode 100644 targettests/qbraid/variable_size_qreg.cpp create mode 100644 unittests/backends/qbraid/CMakeLists.txt create mode 100644 unittests/backends/qbraid/QbraidStartServerAndTest.sh.in create mode 100644 unittests/backends/qbraid/QbraidTester.cpp create mode 100644 utils/mock_qpu/qbraid/__init__.py diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 47cce0eb1e3..ee781d737b5 100644 --- a/.github/workflows/integration_tests.yml +++ 
b/.github/workflows/integration_tests.yml @@ -23,11 +23,16 @@ on: - iqm - oqc - orca +<<<<<<< HEAD - pasqal - qci - quantinuum - scaleway - tii +======= + - fermioniq + - qbraid +>>>>>>> 17f25cf4 (qBraid integration MVP (#4)) single_test_name: type: string required: false diff --git a/docs/sphinx/targets/cpp/qbraid.cpp b/docs/sphinx/targets/cpp/qbraid.cpp new file mode 100644 index 00000000000..4b696005582 --- /dev/null +++ b/docs/sphinx/targets/cpp/qbraid.cpp @@ -0,0 +1,49 @@ +// Compile and run with: +// ``` +// nvq++ --target qbraid qbraid.cpp -o out.x && ./out.x +// ``` +// This will submit the job to the Qbraid ideal simulator target (default). + + +#include +#include + +// Define a simple quantum kernel to execute on Qbraid. +struct ghz { + // Maximally entangled state between 5 qubits. + auto operator()() __qpu__ { + cudaq::qvector q(5); + h(q[0]); + for (int i = 0; i < 4; i++) { + x(q[i], q[i + 1]); + } + auto result = mz(q); + } +}; + +int main() { + // Submit to Qbraid asynchronously (e.g., continue executing + // code in the file until the job has been returned). + auto future = cudaq::sample_async(ghz{}); + // ... classical code to execute in the meantime ... + + // Can write the future to file: + { + std::ofstream out("saveMe.json"); + out << future; + } + + // Then come back and read it in later. + cudaq::async_result readIn; + std::ifstream in("saveMe.json"); + in >> readIn; + + // Get the results of the read in future. + auto async_counts = readIn.get(); + async_counts.dump(); + + // OR: Submit to Qbraid synchronously (e.g., wait for the job + // result to be returned before proceeding). + auto counts = cudaq::sample(ghz{}); + counts.dump(); +} diff --git a/docs/sphinx/targets/python/qbraid.py b/docs/sphinx/targets/python/qbraid.py new file mode 100644 index 00000000000..8450e3a6fd8 --- /dev/null +++ b/docs/sphinx/targets/python/qbraid.py @@ -0,0 +1,52 @@ +import cudaq + +# You only have to set the target once! 
No need to redefine it +# for every execution call on your kernel. +# To use different targets in the same file, you must update +# it via another call to `cudaq.set_target()` +cudaq.set_target("qbraid") + + +# Create the kernel we'd like to execute on Qbraid. +@cudaq.kernel +def kernel(): + qvector = cudaq.qvector(2) + h(qvector[0]) + x.ctrl(qvector[0], qvector[1]) + + + +# Execute on Qbraid and print out the results. + +# Option A: +# By using the asynchronous `cudaq.sample_async`, the remaining +# classical code will be executed while the job is being handled +# by IonQ. This is ideal when submitting via a queue over +# the cloud. +async_results = cudaq.sample_async(kernel) +# ... more classical code to run ... + +# We can either retrieve the results later in the program with +# ``` +# async_counts = async_results.get() +# ``` +# or we can also write the job reference (`async_results`) to +# a file and load it later or from a different process. +file = open("future.txt", "w") +file.write(str(async_results)) +file.close() + +# We can later read the file content and retrieve the job +# information and results. +same_file = open("future.txt", "r") +retrieved_async_results = cudaq.AsyncSampleResult(str(same_file.read())) + +counts = retrieved_async_results.get() +print(counts) + +# Option B: +# By using the synchronous `cudaq.sample`, the execution of +# any remaining classical code in the file will occur only +# after the job has been returned from Qbraid. 
+counts = cudaq.sample(kernel) +print(counts) \ No newline at end of file diff --git a/docs/sphinx/using/backends/cloud.rst b/docs/sphinx/using/backends/cloud.rst index 8c03a4398cc..2395dd6d3b1 100644 --- a/docs/sphinx/using/backends/cloud.rst +++ b/docs/sphinx/using/backends/cloud.rst @@ -7,4 +7,10 @@ CUDA-Q provides a number of options to access hardware resources (GPUs and QPUs) :maxdepth: 1 Amazon Braket (braket) +<<<<<<< HEAD Scaleway QaaS (scaleway) +======= + NVIDIA Quantum Cloud (nvqc) + Qbraid + +>>>>>>> 17f25cf4 (qBraid integration MVP (#4)) diff --git a/docs/sphinx/using/backends/cloud/qbraid.rst b/docs/sphinx/using/backends/cloud/qbraid.rst new file mode 100644 index 00000000000..91184e6b934 --- /dev/null +++ b/docs/sphinx/using/backends/cloud/qbraid.rst @@ -0,0 +1,62 @@ +QBRAID ++++++++ + +.. _qbraid-backend: + +Setting Credentials +````````````````````````` + +Programmers of CUDA-Q may access the `Qbraid Devices +`__ from either C++ or Python. Generate +an API key from your `Qbraid account `__ and export +it as an environment variable: + +.. code:: bash + + export QBRAID_API_KEY="qbraid_generated_api_key" + + +Submission from Python +````````````````````````` + + First, set the :code:`qbraid` backend. + + .. code:: python + + cudaq.set_target('qbraid') + + By default, quantum kernel code will be submitted to the IonQ simulator on qBraid. + + To emulate the qbraid's simulator locally, without submitting through the cloud, you can also set the ``emulate`` flag to ``True``. This will emit any target specific compiler diagnostics. + + .. code:: python + + cudaq.set_target('qbraid', emulate=True) + + The number of shots for a kernel execution can be set through the ``shots_count`` argument to ``cudaq.sample`` or ``cudaq.observe``. By default, the ``shots_count`` is set to 1000. + + .. 
code:: python + + cudaq.sample(kernel, shots_count=10000) + + To see a complete example for using Qbraid's backends, take a look at our :doc:`Python examples <../../examples/examples>`. + +Submission from C++ +````````````````````````` + To target quantum kernel code for execution using qbraid, + pass the flag ``--target qbraid`` to the ``nvq++`` compiler. + + .. code:: bash + + nvq++ --target qbraid src.cpp + + This will take the API key and handle all authentication with, and submission to, the Qbraid device. By default, quantum kernel code will be submitted to the Qbraidsimulator. + + To emulate the qbraid's machine locally, without submitting through the cloud, you can also pass the ``--emulate`` flag to ``nvq++``. This will emit any target specific compiler diagnostics, before running a noise free emulation. + + .. code:: bash + + nvq++ --emulate --target qbraid src.cpp + + To see a complete example for using IonQ's backends, take a look at our :doc:`C++ examples <../../examples/examples>`. + \ No newline at end of file diff --git a/docs/sphinx/using/backends/hardware/iontrap.rst b/docs/sphinx/using/backends/hardware/iontrap.rst index 83e25326455..3d5db2a90e4 100644 --- a/docs/sphinx/using/backends/hardware/iontrap.rst +++ b/docs/sphinx/using/backends/hardware/iontrap.rst @@ -220,6 +220,7 @@ Create a project in the Nexus portal. You can find the project ID in the URL of .. note:: +<<<<<<< HEAD Quantinuum's syntax checker for Helios (e.g., ``Helios-1SC``) only performs QIR code validation and does not return any results. Thus, it always returns an empty result set. This is different from other Quantinuum backends (e.g., ``H2-1SC``) where the syntax checker returns dummy results. As a result, when using the Helios syntax checker, we may receive this warning message: @@ -244,3 +245,70 @@ To see a complete example, take a look at :ref:`Quantinuum examples `__ from either C++ or Python. 
Generate +an API key from your `Qbraid account `__ and export +it as an environment variable: + +.. code:: bash + + export QBRAID_API_KEY="qbraid_generated_api_key" + + +Submitting +````````````````````````` +.. tab:: Python + + First, set the :code:`qbraid` backend. + + .. code:: python + + cudaq.set_target('qbraid') + + By default, quantum kernel code will be submitted to the IonQ simulator on qBraid. + + To emulate the qbraid's simulator locally, without submitting through the cloud, you can also set the ``emulate`` flag to ``True``. This will emit any target specific compiler diagnostics. + + .. code:: python + + cudaq.set_target('qbraid', emulate=True) + + The number of shots for a kernel execution can be set through the ``shots_count`` argument to ``cudaq.sample`` or ``cudaq.observe``. By default, the ``shots_count`` is set to 1000. + + .. code:: python + + cudaq.sample(kernel, shots_count=10000) + + To see a complete example for using Qbraid's backends, take a look at our :doc:`Python examples <../../examples/examples>`. + + +.. tab:: C++ + + To target quantum kernel code for execution using qbraid, + pass the flag ``--target qbraid`` to the ``nvq++`` compiler. + + .. code:: bash + + nvq++ --target qbraid src.cpp + + This will take the API key and handle all authentication with, and submission to, the Qbraid device. By default, quantum kernel code will be submitted to the Qbraidsimulator. + + To emulate the qbraid's machine locally, without submitting through the cloud, you can also pass the ``--emulate`` flag to ``nvq++``. This will emit any target specific compiler diagnostics, before running a noise free emulation. + + .. code:: bash + + nvq++ --emulate --target qbraid src.cpp + + To see a complete example for using IonQ's backends, take a look at our :doc:`C++ examples <../../examples/examples>`. 
+ +>>>>>>> 17f25cf4 (qBraid integration MVP (#4)) diff --git a/lib/Optimizer/CodeGen/Passes.cpp b/lib/Optimizer/CodeGen/Passes.cpp index 8ff6c53c2d1..ce9795bf0c8 100644 --- a/lib/Optimizer/CodeGen/Passes.cpp +++ b/lib/Optimizer/CodeGen/Passes.cpp @@ -100,6 +100,17 @@ static void addFermioniqPipeline(OpPassManager &pm) { pm.addPass(createBasisConversion(options)); } +static void addQbraidPipeline(OpPassManager &pm) { + using namespace cudaq::opt; + std::string basis[] = { + "h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)", + }; + BasisConversionPassOptions options; + options.basis = basis; + options.disabledPatterns = z_disabledPatterns; + pm.addPass(createBasisConversionPass(options)); +} + void cudaq::opt::registerTargetPipelines() { PassPipelineRegistration<>("anyon-cgate-set-mapping", "Convert kernels to Anyon gate set.", @@ -125,6 +136,9 @@ void cudaq::opt::registerTargetPipelines() { PassPipelineRegistration<>("fermioniq-gate-set-mapping", "Convert kernels to Fermioniq gate set.", addFermioniqPipeline); + PassPipelineRegistration<>("qbraid-gate-set-mapping", + "Convert kernels to qBraid gate set.", + addQbraidPipeline); } void cudaq::opt::registerCodeGenDialect(DialectRegistry ®istry) { diff --git a/runtime/cudaq/platform/default/rest/helpers/CMakeLists.txt b/runtime/cudaq/platform/default/rest/helpers/CMakeLists.txt index 5daa54ea114..4574b6ba8fe 100644 --- a/runtime/cudaq/platform/default/rest/helpers/CMakeLists.txt +++ b/runtime/cudaq/platform/default/rest/helpers/CMakeLists.txt @@ -27,3 +27,6 @@ endif() if(CUDAQ_ENABLE_TII_BACKEND) add_subdirectory(tii) endif() +if(CUDAQ_ENABLE_QBRAID_BACKEND) + add_subdirectory(qbraid) +endif() diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/CMakeLists.txt b/runtime/cudaq/platform/default/rest/helpers/qbraid/CMakeLists.txt new file mode 100644 index 00000000000..05b059ecd25 --- /dev/null +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/CMakeLists.txt @@ -0,0 +1,17 @@ +# 
============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +target_sources(cudaq-rest-qpu PRIVATE QbraidServerHelper.cpp) +add_target_config(qbraid) + +add_library(cudaq-serverhelper-qbraid SHARED QbraidServerHelper.cpp ) +target_link_libraries(cudaq-serverhelper-qbraid + PUBLIC + cudaq-common + fmt::fmt-header-only +) +install(TARGETS cudaq-serverhelper-qbraid DESTINATION lib) \ No newline at end of file diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp new file mode 100644 index 00000000000..5e2bf74787e --- /dev/null +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp @@ -0,0 +1,295 @@ +#include "common/Logger.h" +#include "common/RestClient.h" +#include "common/ServerHelper.h" +#include "cudaq/Support/Version.h" +#include "cudaq/utils/cudaq_utils.h" +#include +#include +#include +#include + +namespace cudaq { + +class QbraidServerHelper : public ServerHelper { + static constexpr const char *DEFAULT_URL = "https://api.qbraid.com/api"; + static constexpr const char *DEFAULT_DEVICE = "ionq_simulator"; + static constexpr int DEFAULT_QUBITS = 29; + +public: + const std::string name() const override { return "qbraid"; } + + void initialize(BackendConfig config) override { + cudaq::info("Initializing Qbraid Backend."); + + backendConfig.clear(); + backendConfig["url"] = getValueOrDefault(config, "url", DEFAULT_URL); + backendConfig["device_id"] = getValueOrDefault(config, "device_id", DEFAULT_DEVICE); + backendConfig["user_agent"] = "cudaq/" + std::string(cudaq::getVersion()); + 
backendConfig["qubits"] = std::to_string(DEFAULT_QUBITS); + + backendConfig["api_key"] = getEnvVar("QBRAID_API_KEY", "", true); + backendConfig["job_path"] = backendConfig["url"] + "/quantum-jobs"; + backendConfig["results_path"] = backendConfig["url"] + "/quantum-jobs/result/"; + + backendConfig["results_output_dir"] = getValueOrDefault(config, "results_output_dir", "./qbraid_results"); + backendConfig["results_file_prefix"] = getValueOrDefault(config, "results_file_prefix", "qbraid_job_"); + + if (!config["shots"].empty()) { + backendConfig["shots"] = config["shots"]; + this->setShots(std::stoul(config["shots"])); + } else { + backendConfig["shots"] = "1000"; + this->setShots(1000); + } + + parseConfigForCommonParams(config); + + cudaq::info("Qbraid configuration initialized:"); + for (const auto &[key, value] : backendConfig) { + cudaq::info(" {} = {}", key, value); + } + + std::string resultsDir = backendConfig["results_output_dir"]; + std::filesystem::create_directories(resultsDir); + cudaq::info("Created results directory: {}", resultsDir); + } + + ServerJobPayload + createJob(std::vector &circuitCodes) override { + if (backendConfig.find("job_path") == backendConfig.end()) { + throw std::runtime_error("job_path not found in config. 
Was initialize() called?"); + } + + std::vector jobs; + for (auto &circuitCode : circuitCodes) { + ServerMessage job; + job["qbraidDeviceId"] = backendConfig.at("device_id"); + job["openQasm"] = circuitCode.code; + job["shots"] = std::stoi(backendConfig.at("shots")); + + if (!circuitCode.name.empty()) { + nlohmann::json tags; + tags["name"] = circuitCode.name; + job["tags"] = tags; + } + + jobs.push_back(job); + } + + return std::make_tuple(backendConfig.at("job_path"), getHeaders(), jobs); + } + + std::string extractJobId(ServerMessage &postResponse) override { + if (!postResponse.contains("qbraidJobId")) { + throw std::runtime_error("ServerMessage doesn't contain 'qbraidJobId' key."); + } + return postResponse.at("qbraidJobId"); + } + + std::string constructGetJobPath(ServerMessage &postResponse) override { + if (!postResponse.contains("qbraidJobId")) { + throw std::runtime_error("ServerMessage doesn't contain 'qbraidJobId' key."); + } + + return backendConfig.at("job_path") + "?qbraidJobId=" + postResponse.at("qbraidJobId").get(); + } + + std::string constructGetJobPath(std::string &jobId) override { + return backendConfig.at("job_path") + "?qbraidJobId=" + jobId; + } + + std::string constructGetResultsPath(const std::string &jobId) { + return backendConfig.at("results_path") + jobId; + } + + bool jobIsDone(ServerMessage &getJobResponse) override { + std::string status; + + if (getJobResponse.contains("jobsArray") && !getJobResponse["jobsArray"].empty()) { + status = getJobResponse["jobsArray"][0]["status"].get(); + cudaq::info("Job status from jobs endpoint: {}", status); + } else if (getJobResponse.contains("status")) { + status = getJobResponse["status"].get(); + cudaq::info("Job status from direct response: {}", status); + } else if (getJobResponse.contains("data") && getJobResponse["data"].contains("status")) { + status = getJobResponse["data"]["status"].get(); + cudaq::info("Job status from data object: {}", status); + } else { + cudaq::info("Unexpected 
job response format: {}", getJobResponse.dump()); + throw std::runtime_error("Invalid job response format"); + } + + if (status == "FAILED" || status == "COMPLETED" || status == "CANCELLED") { + saveResponseToFile(getJobResponse); + return true; + } + + return false; + } + + // Sample results with results api - with retry logic + cudaq::sample_result processResults(ServerMessage &getJobResponse, std::string &jobId) override { + int maxRetries = 5; + int waitTime = 2; + float backoffFactor = 2.0; + + for (int attempt = 0; attempt < maxRetries; ++attempt) { + try { + auto resultsPath = constructGetResultsPath(jobId); + auto headers = getHeaders(); + + cudaq::info("Fetching results using direct endpoint (attempt {}/{}): {}", attempt + 1, maxRetries, resultsPath); + RestClient client; + auto resultJson = client.get("", resultsPath, headers, true); + + if (resultJson.contains("error") && !resultJson["error"].is_null()) { + std::string errorMsg = resultJson["error"].is_string() + ? resultJson["error"].get() + : resultJson["error"].dump(); + cudaq::info("Error from results endpoint: {}", errorMsg); + + if (attempt == maxRetries - 1) { + throw std::runtime_error("Error retrieving results: " + errorMsg); + } + } else if (resultJson.contains("data") && resultJson["data"].contains("measurementCounts")) { + cudaq::info("Processing results from direct endpoint"); + CountsDictionary counts; + auto &measurements = resultJson["data"]["measurementCounts"]; + + for (const auto &[bitstring, count] : measurements.items()) { + counts[bitstring] = + count.is_number() + ? static_cast(count.get()) + : static_cast(count); + } + + std::vector execResults; + execResults.emplace_back(ExecutionResult{counts}); + return cudaq::sample_result(execResults); + } + + // If we get here, no valid data was found but also no error - retry + if (attempt < maxRetries - 1) { + int sleepTime = (attempt == 0) ? 
waitTime : waitTime * std::pow(backoffFactor, attempt); + cudaq::info("No valid results yet, retrying in {} seconds", sleepTime); + std::this_thread::sleep_for(std::chrono::seconds(sleepTime)); + } + + } catch (const std::exception &e) { + cudaq::info("Exception when using direct results endpoint: {}", e.what()); + if (attempt < maxRetries - 1) { + int sleepTime = (attempt == 0) ? waitTime : waitTime * std::pow(backoffFactor, attempt); + cudaq::info("Retrying in {} seconds", sleepTime); + std::this_thread::sleep_for(std::chrono::seconds(sleepTime)); + } else { + cudaq::info("Falling back to original results processing method"); + } + } + } + + // Original result processing as fallback + cudaq::info("Processing results from job response for job {}", jobId); + if (getJobResponse.contains("jobsArray") && !getJobResponse["jobsArray"].empty()) { + auto &job = getJobResponse["jobsArray"][0]; + + if (job.contains("measurementCounts")) { + CountsDictionary counts; + auto &measurements = job["measurementCounts"]; + + for (const auto &[bitstring, count] : measurements.items()) { + counts[bitstring] = count.get(); + } + + std::vector execResults; + execResults.emplace_back(ExecutionResult{counts}); + return cudaq::sample_result(execResults); + } + } + + // Last resort - check for direct measurementCounts in the response + if (getJobResponse.contains("measurementCounts")) { + CountsDictionary counts; + auto &measurements = getJobResponse["measurementCounts"]; + + for (const auto &[bitstring, count] : measurements.items()) { + counts[bitstring] = count.get(); + } + + std::vector execResults; + execResults.emplace_back(ExecutionResult{counts}); + return cudaq::sample_result(execResults); + } + + throw std::runtime_error("No measurement counts found in any response format"); + } + + /// @brief Override the polling interval method + std::chrono::microseconds + nextResultPollingInterval(ServerMessage &postResponse) override { + return std::chrono::seconds(1); + } + +private: + void 
saveResponseToFile(const ServerMessage &response, const std::string &identifier = "") { + try { + std::string outputDir = backendConfig.at("results_output_dir"); + std::string filePrefix = backendConfig.at("results_file_prefix"); + + // Create a unique filename using timestamp if no identifier is provided + std::string filename; + if (identifier.empty()) { + auto now = std::chrono::system_clock::now(); + auto timestamp = std::chrono::duration_cast(now.time_since_epoch()).count(); + filename = outputDir + "/" + filePrefix + std::to_string(timestamp) + ".json"; + } else { + filename = outputDir + "/" + filePrefix + identifier + ".json"; + } + + std::ofstream outputFile(filename); + if (!outputFile.is_open()) { + cudaq::info("Failed to open file for writing: {}", filename); + return; + } + + outputFile << response.dump(2); + outputFile.close(); + + cudaq::info("Response saved to file: {}", filename); + } catch (const std::exception &e) { + cudaq::info("Error saving response to file: {}", e.what()); + } + } + + RestHeaders getHeaders() override { + if (backendConfig.find("api_key") == backendConfig.end()) { + throw std::runtime_error("API key not found in config. Was initialize() called?"); + } + + RestHeaders headers; + headers["api-key"] = backendConfig.at("api_key"); + headers["Content-Type"] = "application/json"; + headers["User-Agent"] = backendConfig.at("user_agent"); + return headers; + } + + std::string getEnvVar(const std::string &key, const std::string &defaultVal, const bool isRequired) const { + const char *env_var = std::getenv(key.c_str()); + if (env_var == nullptr) { + if (isRequired) { + throw std::runtime_error(key + " environment variable is not set."); + } + + return defaultVal; + } + return std::string(env_var); + } + + std::string getValueOrDefault(const BackendConfig &config, + const std::string &key, + const std::string &defaultValue) const { + return config.find(key) != config.end() ? 
config.at(key) : defaultValue; + } +}; +} // namespace cudaq + +CUDAQ_REGISTER_TYPE(cudaq::ServerHelper, cudaq::QbraidServerHelper, qbraid) diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml b/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml new file mode 100644 index 00000000000..5132a74d1a7 --- /dev/null +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml @@ -0,0 +1,30 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +name: qbraid +description: "CUDA-Q target for qBraid." +config: + # Tell DefaultQuantumPlatform what QPU subtype to use + platform-qpu: remote_rest + # Tell NVQ++ to generate glue code to set the target backend name + gen-target-backend: true + # Add the rest-qpu library to the link list + link-libs: ["-lcudaq-rest-qpu"] + # Define the lowering pipeline + platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,classical-optimization-pipeline,func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),decomposition{enable-patterns=SToR1,TToR1,CCZToCX,CRyToCX,CRxToCX,R1AdjToR1,RxAdjToRx,RyAdjToRy,RzAdjToRz},quake-to-cc-prep,func.func(memtoreg{quantum=0}),symbol-dce" + # Tell the rest-qpu that we are generating OpenQASM. + codegen-emission: qasm2 + # Library mode is only for simulators, physical backends must turn this off + library-mode: false + +target-arguments: + - key: machine + required: false + type: string + platform-arg: qpu + help-string: "Specify the qBraid QPU." 
\ No newline at end of file diff --git a/targettests/execution/cudaq_observe-cpp17.cpp b/targettests/execution/cudaq_observe-cpp17.cpp new file mode 100644 index 00000000000..ffd05d7780f --- /dev/null +++ b/targettests/execution/cudaq_observe-cpp17.cpp @@ -0,0 +1,56 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// REQUIRES: c++17 +// clang-format off +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// 2 different IQM machines for 2 different topologies +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// clang-format on + +#include +#include + +// The example here shows a simple use case for the `cudaq::observe` +// function in computing expected values of provided spin_ops. 
+ +struct ansatz { + auto operator()(double theta) __qpu__ { + cudaq::qvector q(2); + x(q[0]); + ry(theta, q[1]); + cx(q[1], q[0]); + } +}; + +int main() { + + // Build up your spin op algebraically + cudaq::spin_op h = 5.907 - 2.1433 * cudaq::spin_op::x(0) * cudaq::spin_op::x(1) - + 2.1433 * cudaq::spin_op::y(0) * cudaq::spin_op::y(1) + + .21829 * cudaq::spin_op::z(0) - 6.125 * cudaq::spin_op::z(1); + + // Make repeatable for shots-based emulation + cudaq::set_random_seed(13); + + // Observe takes the kernel, the spin_op, and the concrete + // parameters for the kernel + double energy = cudaq::observe(ansatz{}, h, .59); + printf("Energy is %.16lf\n", energy); + return 0; +} + +// Note: seeds 2 and 12 will push this to -2 instead of -1. All all other +// seeds in 1-100 range will be -1.x. + +// CHECK: Energy is -1. diff --git a/targettests/qbraid/bug_qubit.cpp b/targettests/qbraid/bug_qubit.cpp new file mode 100644 index 00000000000..2179c9f4da1 --- /dev/null +++ b/targettests/qbraid/bug_qubit.cpp @@ -0,0 +1,50 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// This code is from Issue 251. 
+ +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t +// RUN: cudaq-quake %cpp_std %s | cudaq-opt --promote-qubit-allocation | FileCheck --check-prefixes=MLIR %s + +#include +#include + +struct simple_x { + void operator()() __qpu__ { + cudaq::qubit q; + x(q); + mz(q); + } +}; + +// MLIR-LABEL: func.func @__nvqpp__mlirgen__simple_x() +// MLIR-NOT: quake.alloca !quake.ref +// MLIR: %[[VAL_0:.*]] = quake.alloca !quake.veq<1> +// MLIR-NEXT: %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<1>) -> !quake.ref + +int main() { + auto result = cudaq::sample(simple_x{}); + +#ifndef SYNTAX_CHECK + std::cout << result.most_probable() << '\n'; + assert("1" == result.most_probable()); +#endif + + return 0; +} + +// CHECK: 1 diff --git a/targettests/qbraid/callable_kernel_arg.cpp b/targettests/qbraid/callable_kernel_arg.cpp new file mode 100644 index 00000000000..759469537e7 --- /dev/null +++ b/targettests/qbraid/callable_kernel_arg.cpp @@ -0,0 +1,50 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t +// clang-format on + +#include +#include + +__qpu__ void bar(cudaq::qubit &q) { x(q); } + +struct baz { + __qpu__ void operator()(cudaq::qubit &q) { x(q); } +}; + +struct foo { + template + __qpu__ void operator()(CallableKernel &&func, int size) { + cudaq::qvector q(size); + func(q[0]); + auto result = mz(q[0]); + } +}; + +int main() { + auto result = cudaq::sample(1000, foo{}, baz{}, /*qreg size*/ 1); + +#ifndef SYNTAX_CHECK + std::cout << result.most_probable() << '\n'; + assert("1" == result.most_probable()); +#endif + + return 0; +} + +// CHECK: 1 diff --git a/targettests/qbraid/cudaq_observe.cpp b/targettests/qbraid/cudaq_observe.cpp new file mode 100644 index 00000000000..d9d1c537d85 --- /dev/null +++ b/targettests/qbraid/cudaq_observe.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// REQUIRES: c++20 +// clang-format off +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target ionq --emulate %s -o %t && %t | FileCheck %s +// 2 different IQM machines for 2 different topologies +// RUN: nvq++ --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// clang-format on + +#include +#include + +// The example here shows a simple use case for the `cudaq::observe` +// function in computing expected values of provided spin_ops. + +struct ansatz { + auto operator()(double theta) __qpu__ { + cudaq::qvector q(2); + x(q[0]); + ry(theta, q[1]); + x(q[1], q[0]); + } +}; + +int main() { + + // Build up your spin op algebraically + using namespace cudaq::spin; + cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) + + .21829 * z(0) - 6.125 * z(1); + + // Make repeatable for shots-based emulation + cudaq::set_random_seed(13); + + // Observe takes the kernel, the spin_op, and the concrete + // parameters for the kernel + double energy = cudaq::observe(ansatz{}, h, .59); + printf("Energy is %.16lf\n", energy); + return 0; +} + +// Note: seeds 2 and 12 will push this to -2 instead of -1. All other seeds in +// 1-100 range will be -1.x. + +// CHECK: Energy is -1. 
diff --git a/targettests/qbraid/if_jit.cpp b/targettests/qbraid/if_jit.cpp new file mode 100644 index 00000000000..5719dc5b770 --- /dev/null +++ b/targettests/qbraid/if_jit.cpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// This code is from Issue 296. + +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t +// clang-format on + +#include +#include + +__qpu__ void foo(bool value) { + cudaq::qubit q; + if (value) + x(q); + + mz(q); +} + +int main() { + auto result = cudaq::sample(100, foo, true); + +#ifndef SYNTAX_CHECK + std::cout << result.most_probable() << '\n'; + assert("1" == result.most_probable()); +#endif + + return 0; +} + +// CHECK: 1 diff --git a/targettests/qbraid/load_value.cpp b/targettests/qbraid/load_value.cpp new file mode 100644 index 00000000000..ab5d9cec62e --- /dev/null +++ b/targettests/qbraid/load_value.cpp @@ -0,0 +1,63 @@ 
+/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t +// clang-format on + +#include +#include + +__qpu__ void load_value(unsigned value) { + cudaq::qvector qubits(4); + for (std::size_t i = 0; i < 4; ++i) { + // Doesn't work, even with: `if (value)` + if (value & (1 << i)) + x(qubits[3 - i]); + } + + mz(qubits); +} + +int main() { + for (auto i = 0; i < 16; ++i) { + auto result = cudaq::sample(1000, load_value, i); + +#ifndef SYNTAX_CHECK + std::cout << result.most_probable() << '\n'; + assert(i == std::stoi(result.most_probable(), nullptr, 2)); +#endif + } + return 0; +} + +// CHECK: 0000 +// CHECK-NEXT: 0001 +// CHECK-NEXT: 0010 +// CHECK-NEXT: 0011 +// CHECK-NEXT: 0100 +// CHECK-NEXT: 0101 +// CHECK-NEXT: 0110 +// CHECK-NEXT: 0111 +// CHECK-NEXT: 1000 +// 
CHECK-NEXT: 1001 +// CHECK-NEXT: 1010 +// CHECK-NEXT: 1011 +// CHECK-NEXT: 1100 +// CHECK-NEXT: 1101 +// CHECK-NEXT: 1110 +// CHECK-NEXT: 1111 diff --git a/targettests/qbraid/sudoku_2x2-1.cpp b/targettests/qbraid/sudoku_2x2-1.cpp new file mode 100644 index 00000000000..cd028025a0c --- /dev/null +++ b/targettests/qbraid/sudoku_2x2-1.cpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// REQUIRES: c++20 +// clang-format off +// RUN: nvq++ --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// clang-format on + +#include +#include +#include +#include + +__qpu__ void reflect_uniform(cudaq::qvector<> &qubits) { + h(qubits); + x(qubits); + z(qubits[0], qubits[1], qubits[2], qubits[3]); + x(qubits); + h(qubits); +} + +__qpu__ void oracle(cudaq::qvector<> &cs, cudaq::qubit &target) { + x(cs[0], !cs[1], !cs[2], cs[3], target); + x(!cs[0], cs[1], cs[2], !cs[3], target); +} + +__qpu__ void grover() { + cudaq::qvector qubits(4); + cudaq::qubit ancilla; + + // Initialization + x(ancilla); + h(ancilla); + h(qubits); // uniform initialization + + // Don't work?: + for (int i = 0; i < 2; ++i) { + oracle(qubits, ancilla); + reflect_uniform(qubits); + } + 
+ mz(qubits); +}; + +int main() { + auto result = cudaq::sample(1000, grover); + +#ifndef SYNTAX_CHECK + std::vector strings; + for (auto &&[bits, count] : result) { + strings.push_back(bits); + } + std::sort(strings.begin(), strings.end(), [&](auto &a, auto &b) { + return result.count(a) > result.count(b); + }); + std::cout << strings[0] << '\n'; + std::cout << strings[1] << '\n'; + + std::unordered_set most_probable{strings[0], strings[1]}; + assert(most_probable.count("1001") == 1); + assert(most_probable.count("0110") == 1); +#endif + + return 0; +} + +// CHECK-DAG: 1001 +// CHECK-DAG: 0110 diff --git a/targettests/qbraid/sudoku_2x2-bit_names.cpp b/targettests/qbraid/sudoku_2x2-bit_names.cpp new file mode 100644 index 00000000000..ef53021b359 --- /dev/null +++ b/targettests/qbraid/sudoku_2x2-bit_names.cpp @@ -0,0 +1,103 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +// REQUIRES: c++20 +// clang-format off +// RUN: nvq++ --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// clang-format on + +#include +#include +#include +#include + +__qpu__ void reflect_uniform(cudaq::qvector<> &qubits) { + h(qubits); + x(qubits); + z(qubits[0], qubits[1], qubits[2], qubits[3]); + x(qubits); + h(qubits); +} + +__qpu__ void oracle(cudaq::qvector<> &cs, cudaq::qubit &target) { + x(cs[0], !cs[1], !cs[2], cs[3], target); + x(!cs[0], cs[1], cs[2], !cs[3], target); +} + +__qpu__ void grover() { + cudaq::qvector qubits(4); + cudaq::qubit ancilla; + + // Initialization + x(ancilla); + h(ancilla); + h(qubits); // uniform initialization + + oracle(qubits, ancilla); + reflect_uniform(qubits); + oracle(qubits, ancilla); + reflect_uniform(qubits); + + auto groverQubits0 = mz(qubits[0]); + auto groverQubits1 = mz(qubits[1]); + auto groverQubits2 = mz(qubits[2]); + auto groverQubits3 = mz(qubits[3]); +}; + +int main() { + auto result = cudaq::sample(1000, grover); + result.dump(); + + auto& platform = cudaq::get_platform(); + if (platform.is_remote() || platform.is_emulated()) { + // Make sure that the get_marginal() results for the individual register names + // match the subset of the bits from the global register. + // Note that this will fail if you only compile this in library mode. 
+ auto numBits = result.begin()->first.size(); + std::cout << "Checking " << numBits << " bits against global register\n"; + for (size_t b = 0; b < numBits; b++) { + auto regName = "groverQubits" + std::to_string(b); + auto valFromRegName = result.get_marginal({0}, regName); + auto valFromGlobal = result.get_marginal({b}); + if (valFromRegName.to_map() != valFromGlobal.to_map()) { + std::cout << "--- MISMATCH DETECTED in bit " << b << " ---\n"; + valFromRegName.dump(); + valFromGlobal.dump(); + // Mark test failure + assert(valFromRegName.to_map() == valFromGlobal.to_map()); + } + } + } + +#ifndef SYNTAX_CHECK + std::vector strings; + for (auto &&[bits, count] : result) { + strings.push_back(bits); + } + std::sort(strings.begin(), strings.end(), [&](auto& a, auto& b) { + return result.count(a) > result.count(b); + }); + std::cout << strings[0] << '\n'; + std::cout << strings[1] << '\n'; + + std::unordered_set most_probable{strings[0], strings[1]}; + assert(most_probable.count("1001") == 1); + assert(most_probable.count("0110") == 1); +#endif + + return 0; +} + +// CHECK-DAG: 1001 +// CHECK-DAG: 0110 diff --git a/targettests/qbraid/sudoku_2x2-reg_name.cpp b/targettests/qbraid/sudoku_2x2-reg_name.cpp new file mode 100644 index 00000000000..6200c1070f7 --- /dev/null +++ b/targettests/qbraid/sudoku_2x2-reg_name.cpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +// REQUIRES: c++20 +// clang-format off +// RUN: nvq++ --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// clang-format on + +#include +#include +#include +#include + +__qpu__ void reflect_uniform(cudaq::qvector<> &qubits) { + h(qubits); + x(qubits); + z(qubits[0], qubits[1], qubits[2], qubits[3]); + x(qubits); + h(qubits); +} + +__qpu__ void oracle(cudaq::qvector<> &cs, cudaq::qubit &target) { + x(cs[0], !cs[1], !cs[2], cs[3], target); + x(!cs[0], cs[1], cs[2], !cs[3], target); +} + +__qpu__ void grover() { + cudaq::qvector qubits(4); + cudaq::qubit ancilla; + + // Initialization + x(ancilla); + h(ancilla); + h(qubits); // uniform initialization + + oracle(qubits, ancilla); + reflect_uniform(qubits); + oracle(qubits, ancilla); + reflect_uniform(qubits); + + auto groverQubits = mz(qubits); +}; + +int main() { + auto result = cudaq::sample(1000, grover); + result.dump(); + +#ifndef SYNTAX_CHECK + std::vector strings; + for (auto &&[bits, count] : result) { + strings.push_back(bits); + } + std::sort(strings.begin(), strings.end(), [&](auto& a, auto& b) { + return result.count(a) > result.count(b); + }); + std::cout << strings[0] << '\n'; + std::cout << strings[1] << '\n'; + + std::unordered_set most_probable{strings[0], strings[1]}; + assert(most_probable.count("1001") == 1); + assert(most_probable.count("0110") == 1); +#endif + + return 0; +} + +// CHECK-DAG: 1001 +// CHECK-DAG: 0110 diff --git a/targettests/qbraid/sudoku_2x2.cpp 
b/targettests/qbraid/sudoku_2x2.cpp new file mode 100644 index 00000000000..e3d4bc2c0c3 --- /dev/null +++ b/targettests/qbraid/sudoku_2x2.cpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// REQUIRES: c++20 +// clang-format off +// RUN: nvq++ --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// clang-format on + +#include +#include +#include +#include + +__qpu__ void reflect_uniform(cudaq::qvector<> &qubits) { + h(qubits); + x(qubits); + z(qubits[0], qubits[1], qubits[2], qubits[3]); + x(qubits); + h(qubits); +} + +__qpu__ void oracle(cudaq::qvector<> &cs, cudaq::qubit &target) { + x(cs[0], !cs[1], !cs[2], cs[3], target); + x(!cs[0], cs[1], cs[2], !cs[3], target); +} + +__qpu__ void grover() { + cudaq::qvector qubits(4); + cudaq::qubit ancilla; + + // Initialization + x(ancilla); + h(ancilla); + h(qubits); // uniform initialization + + oracle(qubits, ancilla); + reflect_uniform(qubits); + oracle(qubits, ancilla); + reflect_uniform(qubits); + + mz(qubits); +}; + +int main() { + auto result = cudaq::sample(1000, grover); + +#ifndef SYNTAX_CHECK + std::vector strings; + for (auto &&[bits, count] : result) { + strings.push_back(bits); + } 
+ std::sort(strings.begin(), strings.end(), [&](auto& a, auto& b) { + return result.count(a) > result.count(b); + }); + std::cout << strings[0] << '\n'; + std::cout << strings[1] << '\n'; + + std::unordered_set most_probable{strings[0], strings[1]}; + assert(most_probable.count("1001") == 1); + assert(most_probable.count("0110") == 1); +#endif + + return 0; +} + +// CHECK-DAG: 1001 +// CHECK-DAG: 0110 diff --git a/targettests/qbraid/swap_gate.cpp b/targettests/qbraid/swap_gate.cpp new file mode 100644 index 00000000000..4f37edae871 --- /dev/null +++ b/targettests/qbraid/swap_gate.cpp @@ -0,0 +1,43 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t && %t | FileCheck %s + +#include "cudaq.h" +#include + +int main() { + + auto swapKernel = []() __qpu__ { + cudaq::qvector q(2); + x(q[0]); + swap(q[0], q[1]); + + mz(q); + }; + + auto counts = 
cudaq::sample(swapKernel); + +#ifndef SYNTAX_CHECK + std::cout << counts.most_probable() << '\n'; + assert("01" == counts.most_probable()); +#endif + + return 0; +} + +// CHECK: 01 diff --git a/targettests/qbraid/test-int8_t.cpp b/targettests/qbraid/test-int8_t.cpp new file mode 100644 index 00000000000..7178f6c57bb --- /dev/null +++ b/targettests/qbraid/test-int8_t.cpp @@ -0,0 +1,48 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t +// clang-format on + +#include +#include + +struct variable_qreg { + __qpu__ void operator()(std::uint8_t value) { + cudaq::qvector qubits(value); + + mz(qubits); + } +}; + +int main() { + for (auto i = 1; i < 5; ++i) { + auto result = cudaq::sample(1000, variable_qreg{}, i); + +#ifndef SYNTAX_CHECK + std::cout << result.most_probable() << '\n'; + assert(std::string(i, '0') == result.most_probable()); +#endif + } + + return 
0; +} + +// CHECK: 0 +// CHECK: 00 +// CHECK: 000 +// CHECK: 0000 diff --git a/targettests/qbraid/test-int8_t_free_func.cpp b/targettests/qbraid/test-int8_t_free_func.cpp new file mode 100644 index 00000000000..ca9db25ec6c --- /dev/null +++ b/targettests/qbraid/test-int8_t_free_func.cpp @@ -0,0 +1,46 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t +// clang-format on + +#include +#include + +__qpu__ void variable_qreg(std::uint8_t value) { + cudaq::qvector qubits(value); + + mz(qubits); +} + +int main() { + for (auto i = 1; i < 5; ++i) { + auto result = cudaq::sample(1000, variable_qreg, i); + +#ifndef SYNTAX_CHECK + std::cout << result.most_probable() << '\n'; + assert(std::string(i, '0') == result.most_probable()); +#endif + } + + return 0; +} + +// CHECK: 0 +// CHECK-NEXT: 00 +// CHECK-NEXT: 000 +// CHECK-NEXT: 0000 diff --git 
a/targettests/qbraid/variable_size_qreg.cpp b/targettests/qbraid/variable_size_qreg.cpp new file mode 100644 index 00000000000..1f6c139a085 --- /dev/null +++ b/targettests/qbraid/variable_size_qreg.cpp @@ -0,0 +1,46 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target infleqtion --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target qbraid --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi +// RUN: nvq++ -std=c++17 --enable-mlir %s -o %t +// clang-format on + +#include +#include + +__qpu__ void variable_qreg(unsigned value) { + cudaq::qvector qubits(value); + + mz(qubits); +} + +int main() { + for (auto i = 1; i < 5; ++i) { + auto result = cudaq::sample(1000, variable_qreg, i); + +#ifndef SYNTAX_CHECK + std::cout << result.most_probable() << '\n'; + assert(std::string(i, '0') == result.most_probable()); +#endif + } + + return 0; +} + +// CHECK: 0 +// CHECK-NEXT: 00 +// CHECK-NEXT: 000 +// CHECK-NEXT: 0000 diff --git a/tpls/Stim b/tpls/Stim index 42e0b9e0991..47190f4a3af 160000 --- a/tpls/Stim +++ b/tpls/Stim @@ -1 +1 @@ 
-Subproject commit 42e0b9e099180e8570407c33f87b4683cac00d81 +Subproject commit 47190f4a3afb104c9f0068d0be9fea87d2894a70 diff --git a/tpls/cpr b/tpls/cpr index d202b82fbcc..871ed52d350 160000 --- a/tpls/cpr +++ b/tpls/cpr @@ -1 +1 @@ -Subproject commit d202b82fbccf897604a18e035c09e1330dffd082 +Subproject commit 871ed52d350214a034f6ef8a3b8f51c5ce1bd400 diff --git a/tpls/fmt b/tpls/fmt index fc8d07cfe54..ba50c19e827 160000 --- a/tpls/fmt +++ b/tpls/fmt @@ -1 +1 @@ -Subproject commit fc8d07cfe54ba9f5019453dfdb112491246ee017 +Subproject commit ba50c19e827383bd5dacb74189fb4852c8dcbdae diff --git a/tpls/spdlog b/tpls/spdlog index 287333ee005..edc51df1bda 160000 --- a/tpls/spdlog +++ b/tpls/spdlog @@ -1 +1 @@ -Subproject commit 287333ee00555aaece5a5cf6acc9040563c6f642 +Subproject commit edc51df1bdad8667b628999394a1e7c4dc6f3658 diff --git a/unittests/backends/CMakeLists.txt b/unittests/backends/CMakeLists.txt index ed42c11cc55..627ae6a7395 100644 --- a/unittests/backends/CMakeLists.txt +++ b/unittests/backends/CMakeLists.txt @@ -8,15 +8,15 @@ # List of libraries to link with by default to create a test executable set(default_backend_unittest_libs - fmt::fmt-header-only - cudaq-common + fmt::fmt-header-only + cudaq-common cudaq cudaq-builder cudaq-mlir-runtime cudaq-rest-qpu cudaq-operator nvqir nvqir-qpp - cudaq-platform-default + cudaq-platform-default gtest_main) define_property(DIRECTORY PROPERTY BACKEND_UNITTEST_LIBS INHERITED @@ -32,12 +32,12 @@ set_property(DIRECTORY PROPERTY BACKEND_UNITTEST_LIBS ${default_backend_unittest # Helper function to create an executable to be used by the gtest unit tests # - target: positional argument, name of the executable # - BACKEND: named argument to specify a prefix for the test names -# - BACKEND_CONFIG: if present, the test will set NVQPP_TARGET_BACKEND_CONFIG +# - BACKEND_CONFIG: if present, the test will set NVQPP_TARGET_BACKEND_CONFIG # with this value so the backend gets loaded by a constructor before entering main. 
# To avoid issues with semicolon the format is: backend key1=value1 key2=value2 # The function will convert this to : backend;key1;value1;key2;value2 # Example: infleqtion emulate=false url=http://localhost:62447 -# - LINK_LIBS: optional argument to provide non-default list of libraries to link with +# - LINK_LIBS: optional argument to provide non-default list of libraries to link with function(add_backend_unittest_executable target) set(singleValues BACKEND BACKEND_CONFIG) set(multiValues SOURCES INCLUDES LINK_LIBS) @@ -97,6 +97,9 @@ if (OPENSSL_FOUND AND CUDAQ_ENABLE_PYTHON AND CUDAQ_TEST_MOCK_SERVERS) if (CUDAQ_ENABLE_SCALEWAY_BACKEND) add_subdirectory(scaleway) endif() + if (CUDAQ_ENABLE_QBRAID_BACKEND) + add_subdirectory(qbraid) + endif() add_subdirectory(extra_payload_provider) add_subdirectory(quake_backend) endif() diff --git a/unittests/backends/qbraid/CMakeLists.txt b/unittests/backends/qbraid/CMakeLists.txt new file mode 100644 index 00000000000..05ca3c19550 --- /dev/null +++ b/unittests/backends/qbraid/CMakeLists.txt @@ -0,0 +1,27 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +add_executable(test_qbraid QbraidTester.cpp) +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE) + target_link_options(test_qbraid PRIVATE -Wl,--no-as-needed) +endif() +target_compile_definitions(test_qbraid PRIVATE -DNVQIR_BACKEND_NAME=qbraid) +target_include_directories(test_qbraid PRIVATE ../..) 
+target_link_libraries(test_qbraid + PRIVATE fmt::fmt-header-only + cudaq-common + cudaq + cudaq-builder + cudaq-mlir-runtime + cudaq-rest-qpu + cudaq-platform-default + gtest_main) + + +configure_file("QbraidStartServerAndTest.sh.in" "${CMAKE_BINARY_DIR}/unittests/backends/qbraid/QbraidStartServerAndTest.sh" @ONLY) +add_test(NAME qbraid-tests COMMAND bash QbraidStartServerAndTest.sh WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/unittests/backends/qbraid/) diff --git a/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in b/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in new file mode 100644 index 00000000000..8ba8b822945 --- /dev/null +++ b/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in @@ -0,0 +1,43 @@ +#!/bin/bash + +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +checkServerConnection() { + PYTHONPATH=@CMAKE_BINARY_DIR@/python @Python_EXECUTABLE@ - << EOF +import socket +try: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect(("localhost", 62449)) + s.close() +except Exception: + exit(1) +EOF +} + +# Launch the fake server +PYTHONPATH=@CMAKE_BINARY_DIR@/python @Python_EXECUTABLE@ @CMAKE_SOURCE_DIR@/utils/mock_qpu/qbraid/__init__.py & +# we'll need the process id to kill it +pid=$(echo "$!") +n=0 +while ! checkServerConnection; do + sleep 1 + n=$((n+1)) + if [ "$n" -eq "10" ]; then + kill -INT $pid + exit 99 + fi +done +# Run the tests +./test_qbraid +# Did they fail? +testsPassed=$? 
+# kill the server +kill -INT $pid +# return success / failure +exit $testsPassed diff --git a/unittests/backends/qbraid/QbraidTester.cpp b/unittests/backends/qbraid/QbraidTester.cpp new file mode 100644 index 00000000000..9046199e798 --- /dev/null +++ b/unittests/backends/qbraid/QbraidTester.cpp @@ -0,0 +1,177 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "CUDAQTestUtils.h" +#include "common/FmtCore.h" +#include "cudaq/algorithm.h" +#include +#include +#include + +// Update the backend string to match the QBraid format +std::string mockPort = "62449"; +std::string backendStringTemplate = + "qbraid;emulate;false;url;http://localhost:{}"; + +bool isValidExpVal(double value) { + // give us some wiggle room while keep the tests fast + return value < -1.1 && value > -2.3; +} + +CUDAQ_TEST(QbraidTester, checkSampleSync) { + auto backendString = + fmt::format(fmt::runtime(backendStringTemplate), mockPort); + + auto &platform = cudaq::get_platform(); + platform.setTargetBackend(backendString); + + auto kernel = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.h(qubit[0]); + kernel.mz(qubit[0]); + + auto counts = cudaq::sample(kernel); + counts.dump(); + EXPECT_EQ(counts.size(), 2); +} + +CUDAQ_TEST(QbraidTester, checkSampleAsync) { + auto backendString = + fmt::format(fmt::runtime(backendStringTemplate), mockPort); + + auto &platform = cudaq::get_platform(); + platform.setTargetBackend(backendString); + + auto kernel = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.h(qubit[0]); + kernel.mz(qubit[0]); + + auto future = cudaq::sample_async(kernel); + auto 
counts = future.get(); + EXPECT_EQ(counts.size(), 2); +} + +CUDAQ_TEST(QbraidTester, checkSampleAsyncLoadFromFile) { + auto backendString = + fmt::format(fmt::runtime(backendStringTemplate), mockPort); + + auto &platform = cudaq::get_platform(); + platform.setTargetBackend(backendString); + + auto kernel = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.h(qubit[0]); + kernel.mz(qubit[0]); + + auto future = cudaq::sample_async(kernel); + { + std::ofstream out("saveMe.json"); + out << future; + } + + cudaq::async_result readIn; + std::ifstream in("saveMe.json"); + in >> readIn; + + auto counts = readIn.get(); + EXPECT_EQ(counts.size(), 2); + + std::remove("saveMe.json"); +} + +CUDAQ_TEST(QbraidTester, checkObserveSync) { + auto backendString = + fmt::format(fmt::runtime(backendStringTemplate), mockPort); + + auto &platform = cudaq::get_platform(); + platform.setTargetBackend(backendString); + + auto [kernel, theta] = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.x(qubit[0]); + kernel.ry(theta, qubit[1]); + kernel.x(qubit[1], qubit[0]); + + using namespace cudaq::spin; + cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) + + .21829 * z(0) - 6.125 * z(1); + auto result = cudaq::observe(kernel, h, .59); + result.dump(); + + printf("ENERGY: %lf\n", result.expectation()); + EXPECT_TRUE(isValidExpVal(result.expectation())); +} + +CUDAQ_TEST(QbraidTester, checkObserveAsync) { + auto backendString = + fmt::format(fmt::runtime(backendStringTemplate), mockPort); + + auto &platform = cudaq::get_platform(); + platform.setTargetBackend(backendString); + + auto [kernel, theta] = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.x(qubit[0]); + kernel.ry(theta, qubit[1]); + kernel.x(qubit[1], qubit[0]); + + using namespace cudaq::spin; + cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) + + .21829 * z(0) - 6.125 * z(1); + auto future = cudaq::observe_async(kernel, h, .59); + + auto result = 
future.get(); + result.dump(); + + printf("ENERGY: %lf\n", result.expectation()); + EXPECT_TRUE(isValidExpVal(result.expectation())); +} + +CUDAQ_TEST(QbraidTester, checkObserveAsyncLoadFromFile) { + auto backendString = + fmt::format(fmt::runtime(backendStringTemplate), mockPort); + + auto &platform = cudaq::get_platform(); + platform.setTargetBackend(backendString); + + auto [kernel, theta] = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.x(qubit[0]); + kernel.ry(theta, qubit[1]); + kernel.x(qubit[1], qubit[0]); + + using namespace cudaq::spin; + cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) + + .21829 * z(0) - 6.125 * z(1); + auto future = cudaq::observe_async(kernel, h, .59); + + { + std::ofstream out("saveMeObserve.json"); + out << future; + } + + cudaq::async_result readIn(&h); + std::ifstream in("saveMeObserve.json"); + in >> readIn; + + auto result = readIn.get(); + + std::remove("saveMeObserve.json"); + result.dump(); + + printf("ENERGY: %lf\n", result.expectation()); + EXPECT_TRUE(isValidExpVal(result.expectation())); +} + +int main(int argc, char **argv) { + setenv("QBRAID_API_KEY", "00000000000000000000000000000000", 0); + ::testing::InitGoogleTest(&argc, argv); + auto ret = RUN_ALL_TESTS(); + return ret; +} \ No newline at end of file diff --git a/utils/mock_qpu/__init__.py b/utils/mock_qpu/__init__.py index 8167902c1e1..c508a32c796 100644 --- a/utils/mock_qpu/__init__.py +++ b/utils/mock_qpu/__init__.py @@ -21,6 +21,7 @@ "qci": 62449, "scaleway": 62450, "tii": 62451, + "qbraid": 62452, } diff --git a/utils/mock_qpu/qbraid/__init__.py b/utils/mock_qpu/qbraid/__init__.py new file mode 100644 index 00000000000..9d5bae322c1 --- /dev/null +++ b/utils/mock_qpu/qbraid/__init__.py @@ -0,0 +1,240 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. 
# +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +import itertools +import random +import re +import uuid +from typing import Any, Optional + +import uvicorn +from fastapi import FastAPI, Header, HTTPException, Query +from pydantic import BaseModel + +app = FastAPI() + + +class Job(BaseModel): + """Data required to submit a quantum job.""" + + openQasm: str + shots: int + qbraidDeviceId: str + + +JOBS_MOCK_DB = {} +JOBS_MOCK_RESULTS = {} + + +def count_qubits(qasm: str) -> int: + """Extracts the number of qubits from an OpenQASM string.""" + pattern = r"qreg\s+\w+\[(\d+)\];" + + match = re.search(pattern, qasm) + + if match: + return int(match.group(1)) + + raise ValueError("No qreg declaration found in the OpenQASM string.") + + +# def simulate_job(qasm: str, num_shots: int) -> dict[str, int]: +# """Simulates a quantum job by generating random measurement outcomes.""" +# num_qubits = count_qubits(qasm) + +# all_states = ["".join(p) for p in itertools.product("01", repeat=num_qubits)] +# num_states_to_select = random.randint(1, len(all_states)) +# selected_states = random.sample(all_states, num_states_to_select) +# distribution = random.choices(selected_states, k=num_shots) + +# result = {state: distribution.count(state) for state in selected_states} + +# return result + + +def simulate_job(qasm: str, num_shots: int) -> dict[str, int]: + """Simulates a quantum job by generating random measurement outcomes based on the circuit.""" + num_qubits = count_qubits(qasm) + + measured_qubits = [] + + measure_pattern = r"measure\s+(\w+)\[(\d+)\]" + measure_matches = re.findall(measure_pattern, qasm) + + hadamard_pattern = r"h\s+(\w+)\[(\d+)\]" + hadamard_matches = re.findall(hadamard_pattern, qasm) + + superposition_qubits = set() + for _, qubit_idx in hadamard_matches: + 
superposition_qubits.add(int(qubit_idx)) + + for _, qubit_idx in measure_matches: + measured_qubits.append(int(qubit_idx)) + + if not measured_qubits: + measured_qubits = list(range(num_qubits)) + + result = {} + + possible_states = [] + + if measured_qubits: + # Generate strings of the appropriate length for measured qubits + # For superposition qubits, include both 0 and 1 outcomes + for measured_qubit in measured_qubits: + if measured_qubit in superposition_qubits: + if not possible_states: + possible_states = ["0", "1"] + else: + new_states = [] + for state in possible_states: + new_states.append(state + "0") + new_states.append(state + "1") + possible_states = new_states + else: + if not possible_states: + possible_states = ["0"] + else: + possible_states = [state + "0" for state in possible_states] + + if not possible_states: + if superposition_qubits: + possible_states = ["0", "1"] + else: + possible_states = ["0" * num_qubits] + + distribution = random.choices(possible_states, k=num_shots) + result = {state: distribution.count(state) for state in set(distribution)} + + if ( + num_qubits == 2 + and len(measured_qubits) == 1 + and measured_qubits[0] == 0 + and 0 in superposition_qubits + ): + new_result = {} + total_shots = num_shots + half_shots = total_shots // 2 + + new_result["00"] = random.randint( + half_shots - half_shots // 4, half_shots + half_shots // 4 + ) + new_result["01"] = 0 + new_result["10"] = random.randint( + half_shots - half_shots // 4, half_shots + half_shots // 4 + ) + new_result["11"] = 0 + + remaining = total_shots - (new_result["00"] + new_result["10"]) + if remaining > 0: + new_result["00"] += remaining + + result = {k: v for k, v in new_result.items() if v > 0} + + return result + + +def poll_job_status(job_id: str) -> dict[str, Any]: + """Updates the status of a job and returns the updated job data.""" + if job_id not in JOBS_MOCK_DB: + raise HTTPException(status_code=404, detail="Job not found") + + status = 
JOBS_MOCK_DB[job_id]["status"] + + status_transitions = { + "INITIALIZING": "QUEUED", + "QUEUED": "RUNNING", + "RUNNING": "COMPLETED", + "CANCELLING": "CANCELLED", + } + + new_status = status_transitions.get(status, status) + JOBS_MOCK_DB[job_id]["status"] = new_status + + return {"qbraidJobId": job_id, **JOBS_MOCK_DB[job_id]} + + +@app.post("/quantum-jobs") +async def postJob(job: Job, api_key: Optional[str] = Header(None, alias="api-key")): + """Submit a quantum job for execution.""" + if api_key is None: + raise HTTPException(status_code=401, detail="API key is required") + + newId = str(uuid.uuid4()) + + counts = simulate_job(job.openQasm, job.shots) + + job_data = {"status": "INITIALIZING", "statusText": "", **job.model_dump()} + + JOBS_MOCK_DB[newId] = job_data + JOBS_MOCK_RESULTS[newId] = counts + + return {"qbraidJobId": newId, **job_data} + + +@app.get("/quantum-jobs") +async def getJobs( + job_id: Optional[str] = Query(None, alias="qbraidJobId"), + api_key: Optional[str] = Header(None, alias="api-key"), +): + """Retrieve the status of one or more quantum jobs.""" + if api_key is None: + raise HTTPException(status_code=401, detail="API key is required") + + jobs_array = [] + if job_id is None: + for job in JOBS_MOCK_DB: + job_data = poll_job_status(job) + jobs_array.append(job_data) + else: + job_data = poll_job_status(job_id) + jobs_array.append(job_data) + + res = {"jobsArray": jobs_array, "total": len(jobs_array)} + + return res + + +@app.get("/quantum-jobs/result/{job_id}") +async def getJobResult(job_id: str, api_key: Optional[str] = Header(None, alias="api-key")): + """Retrieve the results of a quantum job.""" + if api_key is None: + raise HTTPException(status_code=401, detail="API key is required") + + if job_id not in JOBS_MOCK_DB: + raise HTTPException(status_code=404, detail="Job not found") + + if JOBS_MOCK_DB[job_id]["status"] in {"FAILED", "CANCELLED"}: + raise HTTPException( + status_code=409, detail="Results unavailable. 
Job failed or was cancelled." + ) + + if JOBS_MOCK_DB[job_id]["status"] != "COMPLETED": + return { + "error": "Job still in progress. Results will be available once job is completed.", + "data": {}, + } + + if job_id not in JOBS_MOCK_RESULTS: + raise HTTPException(status_code=500, detail="Job results not found") + + if random.random() < 0.2: + return {"error": "Failed to retrieve job results. Please wait, and try again.", "data": {}} + + counts = JOBS_MOCK_RESULTS[job_id] + + return {"data": {"measurementCounts": counts}} + + +def startServer(port): + """Start the REST server.""" + uvicorn.run(app, port=port, host="0.0.0.0", log_level="info") + + +if __name__ == "__main__": + startServer(62449) From 925ae39eebd02886afd9415a9546b1f74fc65d15 Mon Sep 17 00:00:00 2001 From: TheGupta2012 Date: Fri, 13 Mar 2026 13:27:56 +0530 Subject: [PATCH 02/85] update: migrate cudaq to platform v2 --- .../helpers/qbraid/QbraidServerHelper.cpp | 151 +++++++++--------- .../qbraid/QbraidStartServerAndTest.sh.in | 4 +- unittests/backends/qbraid/QbraidTester.cpp | 2 +- utils/mock_qpu/qbraid/__init__.py | 136 ++++++++++------ 4 files changed, 164 insertions(+), 129 deletions(-) diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp index 5e2bf74787e..5e930c0f2da 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp @@ -3,16 +3,13 @@ #include "common/ServerHelper.h" #include "cudaq/Support/Version.h" #include "cudaq/utils/cudaq_utils.h" -#include -#include -#include #include namespace cudaq { class QbraidServerHelper : public ServerHelper { - static constexpr const char *DEFAULT_URL = "https://api.qbraid.com/api"; - static constexpr const char *DEFAULT_DEVICE = "ionq_simulator"; + static constexpr const char *DEFAULT_URL = "https://api-v2.qbraid.com/api/v1"; + static 
constexpr const char *DEFAULT_DEVICE = "ionq:ionq:sim:simulator"; static constexpr int DEFAULT_QUBITS = 29; public: @@ -28,8 +25,7 @@ class QbraidServerHelper : public ServerHelper { backendConfig["qubits"] = std::to_string(DEFAULT_QUBITS); backendConfig["api_key"] = getEnvVar("QBRAID_API_KEY", "", true); - backendConfig["job_path"] = backendConfig["url"] + "/quantum-jobs"; - backendConfig["results_path"] = backendConfig["url"] + "/quantum-jobs/result/"; + backendConfig["job_path"] = backendConfig["url"] + "/jobs"; backendConfig["results_output_dir"] = getValueOrDefault(config, "results_output_dir", "./qbraid_results"); backendConfig["results_file_prefix"] = getValueOrDefault(config, "results_file_prefix", "qbraid_job_"); @@ -63,14 +59,18 @@ class QbraidServerHelper : public ServerHelper { std::vector jobs; for (auto &circuitCode : circuitCodes) { ServerMessage job; - job["qbraidDeviceId"] = backendConfig.at("device_id"); - job["openQasm"] = circuitCode.code; + job["deviceQrn"] = backendConfig.at("device_id"); job["shots"] = std::stoi(backendConfig.at("shots")); + // v2 API: program is a structured object with format and data + nlohmann::json program; + program["format"] = "qasm2"; + program["data"] = circuitCode.code; + job["program"] = program; + + // v2 API: name is a top-level field (not nested under tags) if (!circuitCode.name.empty()) { - nlohmann::json tags; - tags["name"] = circuitCode.name; - job["tags"] = tags; + job["name"] = circuitCode.name; } jobs.push_back(job); @@ -80,40 +80,47 @@ class QbraidServerHelper : public ServerHelper { } std::string extractJobId(ServerMessage &postResponse) override { - if (!postResponse.contains("qbraidJobId")) { - throw std::runtime_error("ServerMessage doesn't contain 'qbraidJobId' key."); + // v2 API: jobQrn is nested under data envelope + if (postResponse.contains("data") && postResponse["data"].contains("jobQrn")) { + return postResponse["data"]["jobQrn"].get(); } - return postResponse.at("qbraidJobId"); + throw 
std::runtime_error("ServerMessage doesn't contain 'data.jobQrn' key."); } std::string constructGetJobPath(ServerMessage &postResponse) override { - if (!postResponse.contains("qbraidJobId")) { - throw std::runtime_error("ServerMessage doesn't contain 'qbraidJobId' key."); + // v2 API: use path parameter instead of query parameter + if (postResponse.contains("data") && postResponse["data"].contains("jobQrn")) { + return backendConfig.at("job_path") + "/" + postResponse["data"]["jobQrn"].get(); } - - return backendConfig.at("job_path") + "?qbraidJobId=" + postResponse.at("qbraidJobId").get(); + throw std::runtime_error("ServerMessage doesn't contain 'data.jobQrn' key."); } std::string constructGetJobPath(std::string &jobId) override { - return backendConfig.at("job_path") + "?qbraidJobId=" + jobId; + // v2 API: /jobs/{jobQrn} + return backendConfig.at("job_path") + "/" + jobId; } std::string constructGetResultsPath(const std::string &jobId) { - return backendConfig.at("results_path") + jobId; + // v2 API: /jobs/{jobQrn}/result + return backendConfig.at("job_path") + "/" + jobId + "/result"; + } + + std::string constructGetProgramPath(const std::string &jobId) { + // v2 API: /jobs/{jobQrn}/program + return backendConfig.at("job_path") + "/" + jobId + "/program"; } bool jobIsDone(ServerMessage &getJobResponse) override { std::string status; - if (getJobResponse.contains("jobsArray") && !getJobResponse["jobsArray"].empty()) { - status = getJobResponse["jobsArray"][0]["status"].get(); - cudaq::info("Job status from jobs endpoint: {}", status); + // v2 API: status is nested under data envelope + if (getJobResponse.contains("data") && getJobResponse["data"].contains("status")) { + status = getJobResponse["data"]["status"].get(); + cudaq::info("Job status from v2 data envelope: {}", status); } else if (getJobResponse.contains("status")) { + // Fallback: direct status field status = getJobResponse["status"].get(); cudaq::info("Job status from direct response: {}", status); - 
} else if (getJobResponse.contains("data") && getJobResponse["data"].contains("status")) { - status = getJobResponse["data"]["status"].get(); - cudaq::info("Job status from data object: {}", status); } else { cudaq::info("Unexpected job response format: {}", getJobResponse.dump()); throw std::runtime_error("Invalid job response format"); @@ -127,7 +134,26 @@ class QbraidServerHelper : public ServerHelper { return false; } - // Sample results with results api - with retry logic + // Fetch the original program from v2 endpoint + std::string getJobProgram(const ServerMessage &response, const std::string &jobId) override { + auto programPath = constructGetProgramPath(jobId); + auto headers = getHeaders(); + + cudaq::info("Fetching job program from v2 endpoint: {}", programPath); + RestClient client; + auto programJson = client.get("", programPath, headers, true); + + // v2 API: program content at data.data, format at data.format + if (programJson.contains("data") && programJson["data"].contains("data")) { + cudaq::info("Retrieved program (format: {})", + programJson["data"].value("format", "unknown")); + return programJson["data"]["data"].get(); + } + + throw std::runtime_error("Invalid program response format: " + programJson.dump()); + } + + // Fetch results from v2 results endpoint with retry logic cudaq::sample_result processResults(ServerMessage &getJobResponse, std::string &jobId) override { int maxRetries = 5; int waitTime = 2; @@ -138,23 +164,30 @@ class QbraidServerHelper : public ServerHelper { auto resultsPath = constructGetResultsPath(jobId); auto headers = getHeaders(); - cudaq::info("Fetching results using direct endpoint (attempt {}/{}): {}", attempt + 1, maxRetries, resultsPath); + cudaq::info("Fetching results from v2 endpoint (attempt {}/{}): {}", attempt + 1, maxRetries, resultsPath); RestClient client; auto resultJson = client.get("", resultsPath, headers, true); - if (resultJson.contains("error") && !resultJson["error"].is_null()) { - std::string 
errorMsg = resultJson["error"].is_string() - ? resultJson["error"].get() - : resultJson["error"].dump(); - cudaq::info("Error from results endpoint: {}", errorMsg); + // v2 API: error indicated by success=false + if (resultJson.contains("success") && resultJson["success"].is_boolean() + && !resultJson["success"].get()) { + std::string errorMsg = "Results not yet available"; + if (resultJson.contains("data") && resultJson["data"].contains("message")) { + errorMsg = resultJson["data"]["message"].get(); + } + cudaq::info("Results endpoint returned success=false: {}", errorMsg); if (attempt == maxRetries - 1) { throw std::runtime_error("Error retrieving results: " + errorMsg); } - } else if (resultJson.contains("data") && resultJson["data"].contains("measurementCounts")) { - cudaq::info("Processing results from direct endpoint"); + } + // v2 API: measurementCounts nested under data.resultData + else if (resultJson.contains("data") + && resultJson["data"].contains("resultData") + && resultJson["data"]["resultData"].contains("measurementCounts")) { + cudaq::info("Processing results from v2 endpoint"); CountsDictionary counts; - auto &measurements = resultJson["data"]["measurementCounts"]; + auto &measurements = resultJson["data"]["resultData"]["measurementCounts"]; for (const auto &[bitstring, count] : measurements.items()) { counts[bitstring] = @@ -168,7 +201,7 @@ class QbraidServerHelper : public ServerHelper { return cudaq::sample_result(execResults); } - // If we get here, no valid data was found but also no error - retry + // No valid data yet and no explicit error - retry if (attempt < maxRetries - 1) { int sleepTime = (attempt == 0) ? 
waitTime : waitTime * std::pow(backoffFactor, attempt); cudaq::info("No valid results yet, retrying in {} seconds", sleepTime); @@ -176,51 +209,17 @@ class QbraidServerHelper : public ServerHelper { } } catch (const std::exception &e) { - cudaq::info("Exception when using direct results endpoint: {}", e.what()); + cudaq::info("Exception when fetching results: {}", e.what()); if (attempt < maxRetries - 1) { int sleepTime = (attempt == 0) ? waitTime : waitTime * std::pow(backoffFactor, attempt); cudaq::info("Retrying in {} seconds", sleepTime); std::this_thread::sleep_for(std::chrono::seconds(sleepTime)); - } else { - cudaq::info("Falling back to original results processing method"); } } } - // Original result processing as fallback - cudaq::info("Processing results from job response for job {}", jobId); - if (getJobResponse.contains("jobsArray") && !getJobResponse["jobsArray"].empty()) { - auto &job = getJobResponse["jobsArray"][0]; - - if (job.contains("measurementCounts")) { - CountsDictionary counts; - auto &measurements = job["measurementCounts"]; - - for (const auto &[bitstring, count] : measurements.items()) { - counts[bitstring] = count.get(); - } - - std::vector execResults; - execResults.emplace_back(ExecutionResult{counts}); - return cudaq::sample_result(execResults); - } - } - - // Last resort - check for direct measurementCounts in the response - if (getJobResponse.contains("measurementCounts")) { - CountsDictionary counts; - auto &measurements = getJobResponse["measurementCounts"]; - - for (const auto &[bitstring, count] : measurements.items()) { - counts[bitstring] = count.get(); - } - - std::vector execResults; - execResults.emplace_back(ExecutionResult{counts}); - return cudaq::sample_result(execResults); - } - - throw std::runtime_error("No measurement counts found in any response format"); + throw std::runtime_error("Failed to retrieve measurement counts after " + + std::to_string(maxRetries) + " attempts"); } /// @brief Override the polling 
interval method @@ -266,7 +265,7 @@ class QbraidServerHelper : public ServerHelper { } RestHeaders headers; - headers["api-key"] = backendConfig.at("api_key"); + headers["X-API-KEY"] = backendConfig.at("api_key"); headers["Content-Type"] = "application/json"; headers["User-Agent"] = backendConfig.at("user_agent"); return headers; diff --git a/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in b/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in index 8ba8b822945..72ec44e9433 100644 --- a/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in +++ b/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in @@ -13,7 +13,7 @@ checkServerConnection() { import socket try: s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.connect(("localhost", 62449)) + s.connect(("localhost", 62452)) s.close() except Exception: exit(1) @@ -35,7 +35,7 @@ while ! checkServerConnection; do done # Run the tests ./test_qbraid -# Did they fail? +# Did they fail? testsPassed=$? # kill the server kill -INT $pid diff --git a/unittests/backends/qbraid/QbraidTester.cpp b/unittests/backends/qbraid/QbraidTester.cpp index 9046199e798..e3e94fb8087 100644 --- a/unittests/backends/qbraid/QbraidTester.cpp +++ b/unittests/backends/qbraid/QbraidTester.cpp @@ -14,7 +14,7 @@ #include // Update the backend string to match the QBraid format -std::string mockPort = "62449"; +std::string mockPort = "62452"; std::string backendStringTemplate = "qbraid;emulate;false;url;http://localhost:{}"; diff --git a/utils/mock_qpu/qbraid/__init__.py b/utils/mock_qpu/qbraid/__init__.py index 9d5bae322c1..1bb225a59f7 100644 --- a/utils/mock_qpu/qbraid/__init__.py +++ b/utils/mock_qpu/qbraid/__init__.py @@ -13,18 +13,27 @@ from typing import Any, Optional import uvicorn -from fastapi import FastAPI, Header, HTTPException, Query +from fastapi import FastAPI, Header, HTTPException, Path from pydantic import BaseModel app = FastAPI() +class Program(BaseModel): + """Structured program payload for v2 API.""" + + 
format: str + data: str + + class Job(BaseModel): - """Data required to submit a quantum job.""" + """Data required to submit a quantum job (v2 API).""" - openQasm: str + program: Program shots: int - qbraidDeviceId: str + deviceQrn: str + name: Optional[str] = None + tags: Optional[dict] = None JOBS_MOCK_DB = {} @@ -43,20 +52,6 @@ def count_qubits(qasm: str) -> int: raise ValueError("No qreg declaration found in the OpenQASM string.") -# def simulate_job(qasm: str, num_shots: int) -> dict[str, int]: -# """Simulates a quantum job by generating random measurement outcomes.""" -# num_qubits = count_qubits(qasm) - -# all_states = ["".join(p) for p in itertools.product("01", repeat=num_qubits)] -# num_states_to_select = random.randint(1, len(all_states)) -# selected_states = random.sample(all_states, num_states_to_select) -# distribution = random.choices(selected_states, k=num_shots) - -# result = {state: distribution.count(state) for state in selected_states} - -# return result - - def simulate_job(qasm: str, num_shots: int) -> dict[str, int]: """Simulates a quantum job by generating random measurement outcomes based on the circuit.""" num_qubits = count_qubits(qasm) @@ -156,54 +151,79 @@ def poll_job_status(job_id: str) -> dict[str, Any]: new_status = status_transitions.get(status, status) JOBS_MOCK_DB[job_id]["status"] = new_status - return {"qbraidJobId": job_id, **JOBS_MOCK_DB[job_id]} + return {"jobQrn": job_id, **JOBS_MOCK_DB[job_id]} -@app.post("/quantum-jobs") -async def postJob(job: Job, api_key: Optional[str] = Header(None, alias="api-key")): - """Submit a quantum job for execution.""" - if api_key is None: +# v2 API: POST /jobs +@app.post("/jobs") +async def postJob(job: Job, x_api_key: Optional[str] = Header(None, alias="X-API-KEY")): + """Submit a quantum job for execution (v2 API).""" + if x_api_key is None: raise HTTPException(status_code=401, detail="API key is required") newId = str(uuid.uuid4()) - counts = simulate_job(job.openQasm, job.shots) + # 
Extract QASM from the structured program payload + counts = simulate_job(job.program.data, job.shots) job_data = {"status": "INITIALIZING", "statusText": "", **job.model_dump()} JOBS_MOCK_DB[newId] = job_data JOBS_MOCK_RESULTS[newId] = counts - return {"qbraidJobId": newId, **job_data} + # v2 response: wrapped in success/data envelope + return {"success": True, "data": {"jobQrn": newId, "status": "INITIALIZING"}} -@app.get("/quantum-jobs") -async def getJobs( - job_id: Optional[str] = Query(None, alias="qbraidJobId"), - api_key: Optional[str] = Header(None, alias="api-key"), +# v2 API: GET /jobs/{job_qrn} +@app.get("/jobs/{job_id}") +async def getJob( + job_id: str = Path(...), + x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), ): - """Retrieve the status of one or more quantum jobs.""" - if api_key is None: + """Retrieve the status of a quantum job (v2 API).""" + if x_api_key is None: raise HTTPException(status_code=401, detail="API key is required") - jobs_array = [] - if job_id is None: - for job in JOBS_MOCK_DB: - job_data = poll_job_status(job) - jobs_array.append(job_data) - else: - job_data = poll_job_status(job_id) - jobs_array.append(job_data) + job_data = poll_job_status(job_id) - res = {"jobsArray": jobs_array, "total": len(jobs_array)} + # v2 response: wrapped in success/data envelope + return {"success": True, "data": job_data} - return res +# v2 API: GET /jobs/{job_qrn}/program +@app.get("/jobs/{job_id}/program") +async def getJobProgram( + job_id: str = Path(...), + x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), +): + """Retrieve the program of a quantum job (v2 API).""" + if x_api_key is None: + raise HTTPException(status_code=401, detail="API key is required") -@app.get("/quantum-jobs/result/{job_id}") -async def getJobResult(job_id: str, api_key: Optional[str] = Header(None, alias="api-key")): - """Retrieve the results of a quantum job.""" - if api_key is None: + if job_id not in JOBS_MOCK_DB: + raise 
HTTPException(status_code=404, detail="Job not found") + + job_data = JOBS_MOCK_DB[job_id] + + # Return the stored program in v2 format: { success, data: { format, data } } + return { + "success": True, + "data": { + "format": job_data.get("program", {}).get("format", "qasm2"), + "data": job_data.get("program", {}).get("data", ""), + }, + } + + +# v2 API: GET /jobs/{job_qrn}/result +@app.get("/jobs/{job_id}/result") +async def getJobResult( + job_id: str = Path(...), + x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), +): + """Retrieve the results of a quantum job (v2 API).""" + if x_api_key is None: raise HTTPException(status_code=401, detail="API key is required") if job_id not in JOBS_MOCK_DB: @@ -215,20 +235,36 @@ async def getJobResult(job_id: str, api_key: Optional[str] = Header(None, alias= ) if JOBS_MOCK_DB[job_id]["status"] != "COMPLETED": + # v2: use success=false instead of "error" field return { - "error": "Job still in progress. Results will be available once job is completed.", - "data": {}, + "success": False, + "data": {"status": JOBS_MOCK_DB[job_id]["status"]}, } if job_id not in JOBS_MOCK_RESULTS: raise HTTPException(status_code=500, detail="Job results not found") if random.random() < 0.2: - return {"error": "Failed to retrieve job results. Please wait, and try again.", "data": {}} + return { + "success": False, + "data": { + "status": "COMPLETED", + "message": "Failed to retrieve job results. 
Please wait, and try again.", + }, + } counts = JOBS_MOCK_RESULTS[job_id] - return {"data": {"measurementCounts": counts}} + # v2 response: measurementCounts nested under data.resultData + return { + "success": True, + "data": { + "resultData": {"measurementCounts": counts}, + "status": "COMPLETED", + "cost": 0, + "timeStamps": {}, + }, + } def startServer(port): @@ -237,4 +273,4 @@ def startServer(port): if __name__ == "__main__": - startServer(62449) + startServer(62452) From 41fe2486fc9777cd8941ec1d4f4df5da8a6cf389 Mon Sep 17 00:00:00 2001 From: TheGupta2012 Date: Fri, 13 Mar 2026 13:34:02 +0530 Subject: [PATCH 03/85] fix: merge conflicts --- .github/workflows/integration_tests.yml | 30 +++++------ docs/sphinx/using/backends/cloud.rst | 7 +-- docs/sphinx/using/backends/cloud/qbraid.rst | 1 - .../using/backends/hardware/iontrap.rst | 50 +++++++++---------- 4 files changed, 37 insertions(+), 51 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index ee781d737b5..508ed712532 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -10,9 +10,9 @@ on: workflow_dispatch: inputs: target: - description: 'Target (choose nightly to run like nightly tests)' + description: "Target (choose nightly to run like nightly tests)" required: true - default: 'nightly' + default: "nightly" type: choice options: - nightly @@ -23,51 +23,47 @@ on: - iqm - oqc - orca -<<<<<<< HEAD - pasqal - qci - quantinuum - scaleway - tii -======= - - fermioniq - qbraid ->>>>>>> 17f25cf4 (qBraid integration MVP (#4)) single_test_name: type: string required: false - description: 'Single test (e.g., targettests/quantinuum/load_value.cpp). Runs default tests if left blank' + description: "Single test (e.g., targettests/quantinuum/load_value.cpp). Runs default tests if left blank" target_machine: type: string required: false - description: 'Target machine (e.g., H2-1E).' 
+ description: "Target machine (e.g., H2-1E)." cudaq_test_image: type: string required: false - default: '' # picked up from repo variable if not provided - description: 'CUDA Quantum image to run the tests in. Default to the latest CUDA Quantum nightly image' + default: "" # picked up from repo variable if not provided + description: "CUDA Quantum image to run the tests in. Default to the latest CUDA Quantum nightly image" commit_sha: type: string required: false - description: 'Commit SHA to pull the code (examples/tests) for testing. Default to the commit associated with the CUDA Quantum docker image if left blank' + description: "Commit SHA to pull the code (examples/tests) for testing. Default to the commit associated with the CUDA Quantum docker image if left blank" workflow_id: type: string required: false - description: 'Workflow Id to retrieve the Python wheel for testing. Default to the wheels produced by the Publishing workflow associated with the latest nightly CUDA Quantum Docker image if left blank' + description: "Workflow Id to retrieve the Python wheel for testing. Default to the wheels produced by the Publishing workflow associated with the latest nightly CUDA Quantum Docker image if left blank" python_version: type: choice required: true - description: 'Python version to run wheel test' + description: "Python version to run wheel test" options: - - '3.11' - - '3.12' - - '3.13' + - "3.11" + - "3.12" + - "3.13" schedule: - cron: 0 3 * * * env: - python_version: '3.12' + python_version: "3.12" jobs: # Run a daily check of all links in the docs to find any newly broken links diff --git a/docs/sphinx/using/backends/cloud.rst b/docs/sphinx/using/backends/cloud.rst index 2395dd6d3b1..d2044d64e9e 100644 --- a/docs/sphinx/using/backends/cloud.rst +++ b/docs/sphinx/using/backends/cloud.rst @@ -5,12 +5,7 @@ CUDA-Q provides a number of options to access hardware resources (GPUs and QPUs) .. 
toctree:: :maxdepth: 1 - + Amazon Braket (braket) -<<<<<<< HEAD Scaleway QaaS (scaleway) -======= - NVIDIA Quantum Cloud (nvqc) Qbraid - ->>>>>>> 17f25cf4 (qBraid integration MVP (#4)) diff --git a/docs/sphinx/using/backends/cloud/qbraid.rst b/docs/sphinx/using/backends/cloud/qbraid.rst index 91184e6b934..a7e7fe4a2ae 100644 --- a/docs/sphinx/using/backends/cloud/qbraid.rst +++ b/docs/sphinx/using/backends/cloud/qbraid.rst @@ -59,4 +59,3 @@ Submission from C++ nvq++ --emulate --target qbraid src.cpp To see a complete example for using IonQ's backends, take a look at our :doc:`C++ examples <../../examples/examples>`. - \ No newline at end of file diff --git a/docs/sphinx/using/backends/hardware/iontrap.rst b/docs/sphinx/using/backends/hardware/iontrap.rst index 3d5db2a90e4..0dc69de2177 100644 --- a/docs/sphinx/using/backends/hardware/iontrap.rst +++ b/docs/sphinx/using/backends/hardware/iontrap.rst @@ -31,7 +31,7 @@ Submitting By default, quantum kernel code will be submitted to the IonQ simulator. - .. note:: + .. note:: A "target" in :code:`cudaq` refers to a quantum compute provider, such as :code:`ionq`. However, IonQ's documentation uses the term "target" to refer to specific QPU's themselves. @@ -70,7 +70,7 @@ Submitting This will take the API key and handle all authentication with, and submission to, the IonQ QPU(s). By default, quantum kernel code will be submitted to the IonQsimulator. - .. note:: + .. note:: A "target" in :code:`cudaq` refers to a quantum compute provider, such as :code:`ionq`. However, IonQ's documentation uses the term "target" to refer to specific QPU's themselves. @@ -105,7 +105,7 @@ Setting Credentials ``````````````````` Programmers of CUDA-Q may access the Quantinuum API from either -C++ or Python. Quantinuum requires a credential configuration file. +C++ or Python. Quantinuum requires a credential configuration file. 
The configuration file can be generated as follows, replacing the ``email`` and ``credentials`` in the first line with your Quantinuum account details. @@ -134,8 +134,8 @@ Create a project in the Nexus portal. You can find the project ID in the URL of .. tab:: Python - - The backend to which quantum kernels are submitted + + The backend to which quantum kernels are submitted can be controlled with the ``cudaq.set_target()`` function. .. code:: python @@ -155,15 +155,15 @@ Create a project in the Nexus portal. You can find the project ID in the URL of cudaq.set_target('quantinuum', machine='H2-2') where ``H2-2`` is an example of a physical QPU. Hardware specific - emulators may be accessed by appending an ``E`` to the end (e.g, ``H2-2E``). For - access to the syntax checker for the provided machine, you may append an ``SC`` + emulators may be accessed by appending an ``E`` to the end (e.g, ``H2-2E``). For + access to the syntax checker for the provided machine, you may append an ``SC`` to the end (e.g, ``H2-1SC``). - For a comprehensive list of available machines, login to your `Quantinuum Nexus user account `__ + For a comprehensive list of available machines, login to your `Quantinuum Nexus user account `__ and navigate to the "Profile" tab, where you should find a table titled "Quantinuum Systems Access". To emulate the Quantinuum machine locally, without submitting through the cloud, - you can set the ``emulate`` flag to ``True``. This will emit any target + you can set the ``emulate`` flag to ``True``. This will emit any target specific compiler warnings and diagnostics, before running a noise free emulation. You do not need to specify project or machine when emulating. @@ -175,7 +175,7 @@ Create a project in the Nexus portal. You can find the project ID in the URL of the ``shots_count`` argument to ``cudaq.sample`` or ``cudaq.observe``. By default, the ``shots_count`` is set to 1000. - .. code:: python + .. 
code:: python cudaq.sample(kernel, shots_count=10000) @@ -183,7 +183,7 @@ Create a project in the Nexus portal. You can find the project ID in the URL of .. tab:: C++ To target quantum kernel code for execution in the Quantinuum backends, - pass the flag ``--target quantinuum`` to the ``nvq++`` compiler. CUDA-Q will + pass the flag ``--target quantinuum`` to the ``nvq++`` compiler. CUDA-Q will authenticate via the Quantinuum REST API using the credential in your configuration file. By default, quantum kernel code will be submitted to the Quantinuum syntax checker. Submission to the syntax checker merely validates the program; the kernels are not executed. @@ -202,15 +202,15 @@ Create a project in the Nexus portal. You can find the project ID in the URL of nvq++ --target quantinuum --quantinuum-machine H2-2 src.cpp ... where ``H2-2`` is an example of a physical QPU. Hardware specific - emulators may be accessed by appending an ``E`` to the end (e.g, ``H2-2E``). For - access to the syntax checker for the provided machine, you may append an ``SC`` + emulators may be accessed by appending an ``E`` to the end (e.g, ``H2-2E``). For + access to the syntax checker for the provided machine, you may append an ``SC`` to the end (e.g, ``H2-1SC``). - For a comprehensive list of available machines, login to your `Quantinuum Nexus user account `__ + For a comprehensive list of available machines, login to your `Quantinuum Nexus user account `__ and navigate to the "Profile" tab, where you should find a table titled "Quantinuum Systems Access". To emulate the Quantinuum machine locally, without submitting through the cloud, - you can pass the ``--emulate`` flag to ``nvq++``. This will emit any target + you can pass the ``--emulate`` flag to ``nvq++``. This will emit any target specific compiler warnings and diagnostics, before running a noise free emulation. You do not need to specify project or machine when emulating. @@ -218,16 +218,15 @@ Create a project in the Nexus portal. 
You can find the project ID in the URL of nvq++ --emulate --target quantinuum src.cpp -.. note:: +.. note:: -<<<<<<< HEAD Quantinuum's syntax checker for Helios (e.g., ``Helios-1SC``) only performs QIR code validation and does not return any results. Thus, it always returns an empty result set. This is different from other Quantinuum backends (e.g., ``H2-1SC``) where the syntax checker returns dummy results. As a result, when using the Helios syntax checker, we may receive this warning message: .. code:: text - - WARNING: this kernel invocation produced 0 shots worth of results when executed. + + WARNING: this kernel invocation produced 0 shots worth of results when executed. It means that the kernel was successfully validated, but no execution results are available. To get results, please submit to the Helios emulator (e.g., ``Helios-1E``) or the actual quantum device (e.g., ``Helios-1``). @@ -236,16 +235,15 @@ Create a project in the Nexus portal. You can find the project ID in the URL of To see a complete example, take a look at :ref:`Quantinuum examples `. -.. note:: +.. note:: In local emulation mode (``emulate`` flag set to ``True``), the program will be executed on the :ref:`default simulator `. - The environment variable ``CUDAQ_DEFAULT_SIMULATOR`` can be used to change the emulation simulator. - + The environment variable ``CUDAQ_DEFAULT_SIMULATOR`` can be used to change the emulation simulator. + For example, the simulation floating point accuracy and/or the simulation capabilities (e.g., maximum number of qubits, supported quantum gates), - depend on the selected simulator. - + depend on the selected simulator. + Any environment variables must be set prior to setting the target or running "`import cudaq`". -======= QBRAID +++++++ @@ -310,5 +308,3 @@ Submitting nvq++ --emulate --target qbraid src.cpp To see a complete example for using IonQ's backends, take a look at our :doc:`C++ examples <../../examples/examples>`. 
- ->>>>>>> 17f25cf4 (qBraid integration MVP (#4)) From d74243dd59e94091acae92bedd96ae41332bde68 Mon Sep 17 00:00:00 2001 From: TheGupta2012 Date: Fri, 13 Mar 2026 13:47:31 +0530 Subject: [PATCH 04/85] add: api_key and device to set_target for qbraid --- .../rest/helpers/qbraid/QbraidServerHelper.cpp | 17 +++++++++++++++-- .../default/rest/helpers/qbraid/qbraid.yml | 9 +++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp index 5e930c0f2da..3cd688ec579 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp @@ -20,11 +20,24 @@ class QbraidServerHelper : public ServerHelper { backendConfig.clear(); backendConfig["url"] = getValueOrDefault(config, "url", DEFAULT_URL); - backendConfig["device_id"] = getValueOrDefault(config, "device_id", DEFAULT_DEVICE); backendConfig["user_agent"] = "cudaq/" + std::string(cudaq::getVersion()); backendConfig["qubits"] = std::to_string(DEFAULT_QUBITS); - backendConfig["api_key"] = getEnvVar("QBRAID_API_KEY", "", true); + // Accept "machine" as a user-friendly alias for device_id + // Usage: cudaq.set_target("qbraid", machine="ionq:ionq:sim:simulator") + if (!config["machine"].empty()) { + backendConfig["device_id"] = config["machine"]; + } else { + backendConfig["device_id"] = getValueOrDefault(config, "device_id", DEFAULT_DEVICE); + } + + // Accept api_key from target arguments, fall back to QBRAID_API_KEY env var + // Usage: cudaq.set_target("qbraid", api_key="my-key") + if (!config["api_key"].empty()) { + backendConfig["api_key"] = config["api_key"]; + } else { + backendConfig["api_key"] = getEnvVar("QBRAID_API_KEY", "", true); + } backendConfig["job_path"] = backendConfig["url"] + "/jobs"; backendConfig["results_output_dir"] = getValueOrDefault(config, 
"results_output_dir", "./qbraid_results"); diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml b/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml index 5132a74d1a7..99a0f17ee7d 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml @@ -26,5 +26,10 @@ target-arguments: - key: machine required: false type: string - platform-arg: qpu - help-string: "Specify the qBraid QPU." \ No newline at end of file + platform-arg: qpu + help-string: "Specify the qBraid QPU." + - key: api_key + required: false + type: string + platform-arg: api_key + help-string: "Specify the qBraid API key." From 9cd62cffc6562aef4c425857cb97edf80c5a3407 Mon Sep 17 00:00:00 2001 From: Harshit Date: Wed, 15 Apr 2026 10:19:50 +0000 Subject: [PATCH 05/85] fix: submodule hashes and v2 platform implementation and test --- lib/Optimizer/CodeGen/Passes.cpp | 5 +- python/tests/backends/test_Qbraid.py | 184 ++++++++++++++++++ .../rest/helpers/qbraid/CMakeLists.txt | 6 +- .../helpers/qbraid/QbraidServerHelper.cpp | 139 +++++++------ .../default/rest/helpers/qbraid/qbraid.yml | 4 +- tpls/Stim | 2 +- tpls/cpr | 2 +- tpls/fmt | 2 +- tpls/spdlog | 2 +- unittests/backends/qbraid/CMakeLists.txt | 21 +- .../qbraid/QbraidStartServerAndTest.sh.in | 4 + unittests/backends/qbraid/QbraidTester.cpp | 120 ++++++++---- utils/mock_qpu/qbraid/__init__.py | 37 +++- 13 files changed, 397 insertions(+), 131 deletions(-) create mode 100644 python/tests/backends/test_Qbraid.py diff --git a/lib/Optimizer/CodeGen/Passes.cpp b/lib/Optimizer/CodeGen/Passes.cpp index ce9795bf0c8..dc41359f93b 100644 --- a/lib/Optimizer/CodeGen/Passes.cpp +++ b/lib/Optimizer/CodeGen/Passes.cpp @@ -105,10 +105,9 @@ static void addQbraidPipeline(OpPassManager &pm) { std::string basis[] = { "h", "s", "t", "rx", "ry", "rz", "x", "y", "z", "x(1)", }; - BasisConversionPassOptions options; + BasisConversionOptions options; options.basis = 
basis; - options.disabledPatterns = z_disabledPatterns; - pm.addPass(createBasisConversionPass(options)); + pm.addPass(createBasisConversion(options)); } void cudaq::opt::registerTargetPipelines() { diff --git a/python/tests/backends/test_Qbraid.py b/python/tests/backends/test_Qbraid.py new file mode 100644 index 00000000000..6b6d1599753 --- /dev/null +++ b/python/tests/backends/test_Qbraid.py @@ -0,0 +1,184 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +import os +from multiprocessing import Process + +import cudaq +import pytest +from cudaq import spin +from network_utils import check_server_connection + +try: + from utils.mock_qpu.qbraid import startServer +except ImportError: + print("Mock qpu not available, skipping qBraid tests.") + pytest.skip("Mock qpu not available.", allow_module_level=True) + +port = 62452 + +# Default machine for tests. Mirrors the real qBraid device string format. +TEST_MACHINE = "ionq:ionq:sim:simulator" +TEST_API_KEY = "00000000000000000000000000000000" + + +# The qbraid mock server in utils/mock_qpu/qbraid/__init__.py doesn't simulate +# quantum mechanics - it only inspects the QASM for `h` and `measure` ops and +# generates random outcomes for qubits with H. It does NOT model entanglement +# via CNOT. Assertions below reflect the mock's behavior, not physical truth. + + +def _set_qbraid_target(**overrides): + """Call set_target with the canonical qbraid args plus any overrides. + + Uses the documented target arguments (`machine`, `api_key`) plus `url` + which is accepted by the helper for test/mock overrides. 
+ """ + kwargs = { + "url": f"http://localhost:{port}", + "machine": TEST_MACHINE, + "api_key": TEST_API_KEY, + } + kwargs.update(overrides) + cudaq.set_target("qbraid", **kwargs) + + +@pytest.fixture(scope="session", autouse=True) +def startUpMockServer(): + cudaq.set_random_seed(13) + os.environ["QBRAID_API_KEY"] = TEST_API_KEY + + _set_qbraid_target() + + p = Process(target=startServer, args=(port,)) + p.start() + + if not check_server_connection(port): + p.terminate() + pytest.exit("Mock server did not start in time, skipping tests.", + returncode=1) + + yield "Server started." + + p.terminate() + + +@pytest.fixture(scope="function", autouse=True) +def configureTarget(): + _set_qbraid_target() + yield "Running the test." + cudaq.reset_target() + + +def _make_h_kernel(): + """H on q[0], CX to q[1], measure both. Mock only sees H on q[0].""" + kernel = cudaq.make_kernel() + qubits = kernel.qalloc(2) + kernel.h(qubits[0]) + kernel.cx(qubits[0], qubits[1]) + kernel.mz(qubits) + return kernel + + +def test_qbraid_sample(): + counts = cudaq.sample(_make_h_kernel()) + # Mock: q[0] superposition -> {"0","1"}, q[1] fixed -> "0" + # Observed outcomes: "00" and "10" + assert len(counts) == 2 + assert "00" in counts + assert "10" in counts + + +def test_qbraid_sample_async(): + future = cudaq.sample_async(_make_h_kernel()) + counts = future.get() + assert len(counts) == 2 + assert "00" in counts + assert "10" in counts + + +def test_qbraid_sample_async_persist_future(): + future = cudaq.sample_async(_make_h_kernel()) + futureAsString = str(future) + + readIn = cudaq.AsyncSampleResult(futureAsString) + counts = readIn.get() + assert len(counts) == 2 + assert "00" in counts + assert "10" in counts + + +def _make_vqe_ansatz(): + kernel, theta = cudaq.make_kernel(float) + qreg = kernel.qalloc(2) + kernel.x(qreg[0]) + kernel.ry(theta, qreg[1]) + kernel.cx(qreg[1], qreg[0]) + hamiltonian = (5.907 - 2.1433 * spin.x(0) * spin.x(1) - + 2.1433 * spin.y(0) * spin.y(1) + 0.21829 * 
spin.z(0) - + 6.125 * spin.z(1)) + return kernel, hamiltonian + + +def test_qbraid_observe(): + kernel, hamiltonian = _make_vqe_ansatz() + res = cudaq.observe(kernel, hamiltonian, 0.59) + # Mock outcomes are random; just verify the roundtrip returned a finite value. + val = res.expectation() + assert isinstance(val, float) + assert val == val # NaN check + + +def test_qbraid_observe_async_persist_future(): + kernel, hamiltonian = _make_vqe_ansatz() + + future = cudaq.observe_async(kernel, hamiltonian, 0.59) + futureAsString = str(future) + + readIn = cudaq.AsyncObserveResult(futureAsString, hamiltonian) + res = readIn.get() + val = res.expectation() + assert isinstance(val, float) + assert val == val + + +def test_qbraid_api_key_via_target_arg_without_env_var(): + """When QBRAID_API_KEY env var is absent, api_key kwarg must work.""" + saved = os.environ.pop("QBRAID_API_KEY", None) + try: + _set_qbraid_target(api_key=TEST_API_KEY) + + kernel = cudaq.make_kernel() + qubit = kernel.qalloc() + kernel.h(qubit) + kernel.mz(qubit) + + counts = cudaq.sample(kernel) + assert len(counts) >= 1 + finally: + if saved is not None: + os.environ["QBRAID_API_KEY"] = saved + + +def test_qbraid_machine_alternative_device(): + """A different machine string is accepted via the target arg.""" + _set_qbraid_target(machine="aws:aws:sim:sv1") + + kernel = cudaq.make_kernel() + qubit = kernel.qalloc() + kernel.h(qubit) + kernel.mz(qubit) + + counts = cudaq.sample(kernel) + assert len(counts) >= 1 + + +# leave for gdb debugging +if __name__ == "__main__": + loc = os.path.abspath(__file__) + pytest.main([loc, "-s"]) diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/CMakeLists.txt b/runtime/cudaq/platform/default/rest/helpers/qbraid/CMakeLists.txt index 05b059ecd25..823c01fd100 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/CMakeLists.txt +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/CMakeLists.txt @@ -10,8 +10,8 @@ add_target_config(qbraid) 
add_library(cudaq-serverhelper-qbraid SHARED QbraidServerHelper.cpp ) target_link_libraries(cudaq-serverhelper-qbraid - PUBLIC - cudaq-common - fmt::fmt-header-only + PUBLIC + cudaq-common + cudaq-logger ) install(TARGETS cudaq-serverhelper-qbraid DESTINATION lib) \ No newline at end of file diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp index 3cd688ec579..8b26f8b3d45 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp @@ -1,8 +1,10 @@ -#include "common/Logger.h" #include "common/RestClient.h" #include "common/ServerHelper.h" #include "cudaq/Support/Version.h" +#include "cudaq/runtime/logger/logger.h" #include "cudaq/utils/cudaq_utils.h" +#include +#include #include namespace cudaq { @@ -40,9 +42,6 @@ class QbraidServerHelper : public ServerHelper { } backendConfig["job_path"] = backendConfig["url"] + "/jobs"; - backendConfig["results_output_dir"] = getValueOrDefault(config, "results_output_dir", "./qbraid_results"); - backendConfig["results_file_prefix"] = getValueOrDefault(config, "results_file_prefix", "qbraid_job_"); - if (!config["shots"].empty()) { backendConfig["shots"] = config["shots"]; this->setShots(std::stoul(config["shots"])); @@ -57,10 +56,6 @@ class QbraidServerHelper : public ServerHelper { for (const auto &[key, value] : backendConfig) { cudaq::info(" {} = {}", key, value); } - - std::string resultsDir = backendConfig["results_output_dir"]; - std::filesystem::create_directories(resultsDir); - cudaq::info("Created results directory: {}", resultsDir); } ServerJobPayload @@ -73,12 +68,13 @@ class QbraidServerHelper : public ServerHelper { for (auto &circuitCode : circuitCodes) { ServerMessage job; job["deviceQrn"] = backendConfig.at("device_id"); - job["shots"] = std::stoi(backendConfig.at("shots")); + // Use the per-call 
shots (set via cudaq::sample(..., shots_count=N)) + job["shots"] = shots; // v2 API: program is a structured object with format and data nlohmann::json program; program["format"] = "qasm2"; - program["data"] = circuitCode.code; + program["data"] = normalizeClassicalRegisters(circuitCode.code); job["program"] = program; // v2 API: name is a top-level field (not nested under tags) @@ -118,11 +114,6 @@ class QbraidServerHelper : public ServerHelper { return backendConfig.at("job_path") + "/" + jobId + "/result"; } - std::string constructGetProgramPath(const std::string &jobId) { - // v2 API: /jobs/{jobQrn}/program - return backendConfig.at("job_path") + "/" + jobId + "/program"; - } - bool jobIsDone(ServerMessage &getJobResponse) override { std::string status; @@ -140,37 +131,25 @@ class QbraidServerHelper : public ServerHelper { } if (status == "FAILED" || status == "COMPLETED" || status == "CANCELLED") { - saveResponseToFile(getJobResponse); return true; } return false; } - // Fetch the original program from v2 endpoint - std::string getJobProgram(const ServerMessage &response, const std::string &jobId) override { - auto programPath = constructGetProgramPath(jobId); - auto headers = getHeaders(); - - cudaq::info("Fetching job program from v2 endpoint: {}", programPath); - RestClient client; - auto programJson = client.get("", programPath, headers, true); - - // v2 API: program content at data.data, format at data.format - if (programJson.contains("data") && programJson["data"].contains("data")) { - cudaq::info("Retrieved program (format: {})", - programJson["data"].value("format", "unknown")); - return programJson["data"]["data"].get(); - } - - throw std::runtime_error("Invalid program response format: " + programJson.dump()); - } - - // Fetch results from v2 results endpoint with retry logic + // Fetch results from v2 results endpoint with retry logic. 
+ // + // Rationale: qbraid's v2 API has a window where status transitions to + // COMPLETED before the result payload is queryable on /result, so /result + // returns {success: false, data: {message: "not yet available"}}. The retry + // with backoff absorbs that race. + // + // Exercised deterministically via the mock's POST /test/delay_next_results + // endpoint (see checkResultRetry / checkResultRetryExhaustion tests). cudaq::sample_result processResults(ServerMessage &getJobResponse, std::string &jobId) override { - int maxRetries = 5; - int waitTime = 2; - float backoffFactor = 2.0; + const int maxRetries = 3; + const int waitTime = 2; + const float backoffFactor = 2.0; for (int attempt = 0; attempt < maxRetries; ++attempt) { try { @@ -242,34 +221,68 @@ class QbraidServerHelper : public ServerHelper { } private: - void saveResponseToFile(const ServerMessage &response, const std::string &identifier = "") { - try { - std::string outputDir = backendConfig.at("results_output_dir"); - std::string filePrefix = backendConfig.at("results_file_prefix"); - - // Create a unique filename using timestamp if no identifier is provided - std::string filename; - if (identifier.empty()) { - auto now = std::chrono::system_clock::now(); - auto timestamp = std::chrono::duration_cast(now.time_since_epoch()).count(); - filename = outputDir + "/" + filePrefix + std::to_string(timestamp) + ".json"; - } else { - filename = outputDir + "/" + filePrefix + identifier + ".json"; - } + // Merge multiple single-bit classical registers emitted by nvq++'s QASM 2 + // codegen into a single multi-bit `creg c[N]`. This is required to unblock + // qBraid-routed hardware backends. + // + // Context: nvq++ emits one `creg varK[1];` per measurement. 
AWS Braket's + // classical simulators (SV1, DM1, TN1) tolerate that via lenient register + // concatenation, but stricter hardware transpilers reject it: + // - IQM (Garnet etc.): returns only the first register -> 1-bit results + // - Rigetti: collapses all registers onto b[0] -> "bit already in use" + // - IonQ-via-Braket: similar strict behavior + // Normalizing to a single register is the canonical QASM 2 form and is + // accepted uniformly by every qBraid-reachable backend. + std::string normalizeClassicalRegisters(const std::string &qasm) const { + static const std::regex cregDeclRx( + R"(creg\s+(\w+)\s*\[\s*(\d+)\s*\]\s*;)"); + + std::vector> cregs; + for (auto it = std::sregex_iterator(qasm.begin(), qasm.end(), cregDeclRx); + it != std::sregex_iterator(); ++it) { + cregs.emplace_back((*it)[1].str(), std::stoi((*it)[2].str())); + } - std::ofstream outputFile(filename); - if (!outputFile.is_open()) { - cudaq::info("Failed to open file for writing: {}", filename); - return; - } + // Nothing to do if the QASM already has a single classical register. + if (cregs.size() <= 1) + return qasm; + + std::map offsetByName; + int totalBits = 0; + for (auto &[name, size] : cregs) { + offsetByName[name] = totalBits; + totalBits += size; + } - outputFile << response.dump(2); - outputFile.close(); + std::string out = qasm; - cudaq::info("Response saved to file: {}", filename); - } catch (const std::exception &e) { - cudaq::info("Error saving response to file: {}", e.what()); + // Rewrite every `-> NAME[i]` target BEFORE we mutate the creg declarations. + for (auto &[name, size] : cregs) { + int base = offsetByName[name]; + for (int i = 0; i < size; ++i) { + std::regex measureTargetRx("->\\s*" + name + "\\s*\\[\\s*" + + std::to_string(i) + "\\s*\\]"); + out = std::regex_replace(out, measureTargetRx, + "-> qbraid__creg__[" + std::to_string(base + i) + "]"); + } + } + + // Replace the first declaration with the merged register. 
+ out = std::regex_replace(out, cregDeclRx, + "creg qbraid__creg__[" + + std::to_string(totalBits) + "];", + std::regex_constants::format_first_only); + + // Remove the remaining original declarations. + for (size_t i = 1; i < cregs.size(); ++i) { + std::regex toRemove("creg\\s+" + cregs[i].first + + "\\s*\\[\\s*\\d+\\s*\\]\\s*;\\s*"); + out = std::regex_replace(out, toRemove, ""); } + + cudaq::info("Normalized {} classical registers into single qbraid__creg__[{}]", + cregs.size(), totalBits); + return out; } RestHeaders getHeaders() override { diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml b/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml index 99a0f17ee7d..0ee345afd43 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/qbraid.yml @@ -15,8 +15,8 @@ config: gen-target-backend: true # Add the rest-qpu library to the link list link-libs: ["-lcudaq-rest-qpu"] - # Define the lowering pipeline - platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,classical-optimization-pipeline,func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),decomposition{enable-patterns=SToR1,TToR1,CCZToCX,CRyToCX,CRxToCX,R1AdjToR1,RxAdjToRx,RyAdjToRy,RzAdjToRz},quake-to-cc-prep,func.func(memtoreg{quantum=0}),symbol-dce" + # Define the JIT lowering pipeline + jit-mid-level-pipeline: "qbraid-gate-set-mapping" # Tell the rest-qpu that we are generating OpenQASM. 
codegen-emission: qasm2 # Library mode is only for simulators, physical backends must turn this off diff --git a/tpls/Stim b/tpls/Stim index 47190f4a3af..42e0b9e0991 160000 --- a/tpls/Stim +++ b/tpls/Stim @@ -1 +1 @@ -Subproject commit 47190f4a3afb104c9f0068d0be9fea87d2894a70 +Subproject commit 42e0b9e099180e8570407c33f87b4683cac00d81 diff --git a/tpls/cpr b/tpls/cpr index 871ed52d350..d202b82fbcc 160000 --- a/tpls/cpr +++ b/tpls/cpr @@ -1 +1 @@ -Subproject commit 871ed52d350214a034f6ef8a3b8f51c5ce1bd400 +Subproject commit d202b82fbccf897604a18e035c09e1330dffd082 diff --git a/tpls/fmt b/tpls/fmt index ba50c19e827..fc8d07cfe54 160000 --- a/tpls/fmt +++ b/tpls/fmt @@ -1 +1 @@ -Subproject commit ba50c19e827383bd5dacb74189fb4852c8dcbdae +Subproject commit fc8d07cfe54ba9f5019453dfdb112491246ee017 diff --git a/tpls/spdlog b/tpls/spdlog index edc51df1bda..287333ee005 160000 --- a/tpls/spdlog +++ b/tpls/spdlog @@ -1 +1 @@ -Subproject commit edc51df1bdad8667b628999394a1e7c4dc6f3658 +Subproject commit 287333ee00555aaece5a5cf6acc9040563c6f642 diff --git a/unittests/backends/qbraid/CMakeLists.txt b/unittests/backends/qbraid/CMakeLists.txt index 05ca3c19550..390d20cc896 100644 --- a/unittests/backends/qbraid/CMakeLists.txt +++ b/unittests/backends/qbraid/CMakeLists.txt @@ -6,22 +6,11 @@ # the terms of the Apache License 2.0 which accompanies this distribution. # # ============================================================================ # -add_executable(test_qbraid QbraidTester.cpp) -if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE) - target_link_options(test_qbraid PRIVATE -Wl,--no-as-needed) -endif() -target_compile_definitions(test_qbraid PRIVATE -DNVQIR_BACKEND_NAME=qbraid) -target_include_directories(test_qbraid PRIVATE ../..) 
-target_link_libraries(test_qbraid - PRIVATE fmt::fmt-header-only - cudaq-common - cudaq - cudaq-builder - cudaq-mlir-runtime - cudaq-rest-qpu - cudaq-platform-default - gtest_main) - +add_backend_unittest_executable(test_qbraid + SOURCES QbraidTester.cpp + BACKEND qbraid + BACKEND_CONFIG "qbraid emulate=false url=http://localhost:62452 api_key=00000000000000000000000000000000" +) configure_file("QbraidStartServerAndTest.sh.in" "${CMAKE_BINARY_DIR}/unittests/backends/qbraid/QbraidStartServerAndTest.sh" @ONLY) add_test(NAME qbraid-tests COMMAND bash QbraidStartServerAndTest.sh WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/unittests/backends/qbraid/) diff --git a/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in b/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in index 72ec44e9433..bd5c15b9af7 100644 --- a/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in +++ b/unittests/backends/qbraid/QbraidStartServerAndTest.sh.in @@ -33,6 +33,10 @@ while ! checkServerConnection; do exit 99 fi done +# api_key is passed via the backend config (see CMakeLists BACKEND_CONFIG), +# so we unset QBRAID_API_KEY to force the helper to use the config value. +# checkApiKeyFromTarget asserts the env var is null. +unset QBRAID_API_KEY # Run the tests ./test_qbraid # Did they fail? 
diff --git a/unittests/backends/qbraid/QbraidTester.cpp b/unittests/backends/qbraid/QbraidTester.cpp index e3e94fb8087..0d5b1fb1b09 100644 --- a/unittests/backends/qbraid/QbraidTester.cpp +++ b/unittests/backends/qbraid/QbraidTester.cpp @@ -8,6 +8,7 @@ #include "CUDAQTestUtils.h" #include "common/FmtCore.h" +#include "common/RestClient.h" #include "cudaq/algorithm.h" #include #include @@ -19,17 +20,15 @@ std::string backendStringTemplate = "qbraid;emulate;false;url;http://localhost:{}"; bool isValidExpVal(double value) { - // give us some wiggle room while keep the tests fast - return value < -1.1 && value > -2.3; + // The qbraid mock server doesn't simulate quantum mechanics - X0X1 counts + // are uniform random per 1000-shot sample (std dev ~0.03), so the + // expectation value for this VQE Hamiltonian fluctuates around -2.14 by + // a few hundredths per run. The band below is wide enough (~10 sigma) to + // be stable across test runs while still catching corrupt / NaN results. + return value < -1.0 && value > -3.0; } CUDAQ_TEST(QbraidTester, checkSampleSync) { - auto backendString = - fmt::format(fmt::runtime(backendStringTemplate), mockPort); - - auto &platform = cudaq::get_platform(); - platform.setTargetBackend(backendString); - auto kernel = cudaq::make_kernel(); auto qubit = kernel.qalloc(2); kernel.h(qubit[0]); @@ -41,12 +40,6 @@ CUDAQ_TEST(QbraidTester, checkSampleSync) { } CUDAQ_TEST(QbraidTester, checkSampleAsync) { - auto backendString = - fmt::format(fmt::runtime(backendStringTemplate), mockPort); - - auto &platform = cudaq::get_platform(); - platform.setTargetBackend(backendString); - auto kernel = cudaq::make_kernel(); auto qubit = kernel.qalloc(2); kernel.h(qubit[0]); @@ -58,12 +51,6 @@ CUDAQ_TEST(QbraidTester, checkSampleAsync) { } CUDAQ_TEST(QbraidTester, checkSampleAsyncLoadFromFile) { - auto backendString = - fmt::format(fmt::runtime(backendStringTemplate), mockPort); - - auto &platform = cudaq::get_platform(); - 
platform.setTargetBackend(backendString); - auto kernel = cudaq::make_kernel(); auto qubit = kernel.qalloc(2); kernel.h(qubit[0]); @@ -86,12 +73,6 @@ CUDAQ_TEST(QbraidTester, checkSampleAsyncLoadFromFile) { } CUDAQ_TEST(QbraidTester, checkObserveSync) { - auto backendString = - fmt::format(fmt::runtime(backendStringTemplate), mockPort); - - auto &platform = cudaq::get_platform(); - platform.setTargetBackend(backendString); - auto [kernel, theta] = cudaq::make_kernel(); auto qubit = kernel.qalloc(2); kernel.x(qubit[0]); @@ -109,12 +90,6 @@ CUDAQ_TEST(QbraidTester, checkObserveSync) { } CUDAQ_TEST(QbraidTester, checkObserveAsync) { - auto backendString = - fmt::format(fmt::runtime(backendStringTemplate), mockPort); - - auto &platform = cudaq::get_platform(); - platform.setTargetBackend(backendString); - auto [kernel, theta] = cudaq::make_kernel(); auto qubit = kernel.qalloc(2); kernel.x(qubit[0]); @@ -134,12 +109,6 @@ CUDAQ_TEST(QbraidTester, checkObserveAsync) { } CUDAQ_TEST(QbraidTester, checkObserveAsyncLoadFromFile) { - auto backendString = - fmt::format(fmt::runtime(backendStringTemplate), mockPort); - - auto &platform = cudaq::get_platform(); - platform.setTargetBackend(backendString); - auto [kernel, theta] = cudaq::make_kernel(); auto qubit = kernel.qalloc(2); kernel.x(qubit[0]); @@ -169,8 +138,81 @@ CUDAQ_TEST(QbraidTester, checkObserveAsyncLoadFromFile) { EXPECT_TRUE(isValidExpVal(result.expectation())); } +// Every test in this file runs through the backend configured by +// add_backend_unittest_executable in CMakeLists, which passes api_key via the +// target config (BACKEND_CONFIG). QBRAID_API_KEY env var is NOT set by the +// launch script, so a successful sample here exercises the target-arg path. +CUDAQ_TEST(QbraidTester, checkApiKeyFromTarget) { + ASSERT_EQ(std::getenv("QBRAID_API_KEY"), nullptr) + << "QBRAID_API_KEY should not be set; this test verifies the " + "api_key=... 
target-arg path."; + + auto kernel = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.h(qubit[0]); + kernel.mz(qubit[0]); + + auto counts = cudaq::sample(kernel); + EXPECT_GE(counts.size(), 1u); +} + +CUDAQ_TEST(QbraidTester, checkJobFailure) { + // Arm the mock to fail the next submitted job. + cudaq::RestClient client; + nlohmann::json body = nlohmann::json::object(); + std::map headers; + auto armed = client.post("http://localhost:62452/", "test/fail_next", body, + headers, /*enableLogging=*/false); + ASSERT_TRUE(armed.value("armed", false)); + + auto kernel = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.h(qubit[0]); + kernel.mz(qubit[0]); + + EXPECT_ANY_THROW({ (void)cudaq::sample(kernel); }); +} + +// Arm the mock to make the next N /result calls return "not yet available", +// so processResults must retry. maxRetries is 3, so 2 delays should succeed. +CUDAQ_TEST(QbraidTester, checkResultRetry) { + cudaq::RestClient client; + nlohmann::json body = nlohmann::json::object(); + std::map headers; + auto armed = + client.post("http://localhost:62452/", "test/delay_next_results/2", body, + headers, /*enableLogging=*/false); + ASSERT_EQ(armed.value("remaining", -1), 2); + + auto kernel = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.h(qubit[0]); + kernel.mz(qubit[0]); + + auto counts = cudaq::sample(kernel); + EXPECT_GE(counts.size(), 1u); +} + +// Arm enough delays to exhaust the retry budget (maxRetries = 3). Sample must +// throw. Uses 10 so the retry loop can never succeed. 
+CUDAQ_TEST(QbraidTester, checkResultRetryExhaustion) { + cudaq::RestClient client; + nlohmann::json body = nlohmann::json::object(); + std::map headers; + auto armed = + client.post("http://localhost:62452/", "test/delay_next_results/10", body, + headers, /*enableLogging=*/false); + ASSERT_EQ(armed.value("remaining", -1), 10); + + auto kernel = cudaq::make_kernel(); + auto qubit = kernel.qalloc(2); + kernel.h(qubit[0]); + kernel.mz(qubit[0]); + + EXPECT_ANY_THROW({ (void)cudaq::sample(kernel); }); +} + int main(int argc, char **argv) { - setenv("QBRAID_API_KEY", "00000000000000000000000000000000", 0); ::testing::InitGoogleTest(&argc, argv); auto ret = RUN_ALL_TESTS(); return ret; diff --git a/utils/mock_qpu/qbraid/__init__.py b/utils/mock_qpu/qbraid/__init__.py index 1bb225a59f7..70686afd7c3 100644 --- a/utils/mock_qpu/qbraid/__init__.py +++ b/utils/mock_qpu/qbraid/__init__.py @@ -38,6 +38,13 @@ class Job(BaseModel): JOBS_MOCK_DB = {} JOBS_MOCK_RESULTS = {} +# Testing toggle: when True, the next job submitted via POST /jobs is created +# with status FAILED. Consumed (reset to False) after use. +FAIL_NEXT_JOB = {"enabled": False} +# Testing counter: how many upcoming GET /jobs/{id}/result calls should return +# success=false (simulating the qbraid v2 race where status=COMPLETED before +# results are queryable). Decrements on each /result call until 0. +DELAY_RESULTS_COUNT = {"remaining": 0} def count_qubits(qasm: str) -> int: @@ -163,6 +170,17 @@ async def postJob(job: Job, x_api_key: Optional[str] = Header(None, alias="X-API newId = str(uuid.uuid4()) + # Test hook: fail this job immediately if the toggle was armed. 
+ if FAIL_NEXT_JOB["enabled"]: + FAIL_NEXT_JOB["enabled"] = False + job_data = { + "status": "FAILED", + "statusText": "Triggered failure for testing", + **job.model_dump(), + } + JOBS_MOCK_DB[newId] = job_data + return {"success": True, "data": {"jobQrn": newId, "status": "FAILED"}} + # Extract QASM from the structured program payload counts = simulate_job(job.program.data, job.shots) @@ -175,6 +193,20 @@ async def postJob(job: Job, x_api_key: Optional[str] = Header(None, alias="X-API return {"success": True, "data": {"jobQrn": newId, "status": "INITIALIZING"}} +# Test-only: arm a failure for the next submitted job. +@app.post("/test/fail_next") +async def armFailNext(): + FAIL_NEXT_JOB["enabled"] = True + return {"armed": True} + + +# Test-only: force the next N /result calls to return success=false. +@app.post("/test/delay_next_results/{count}") +async def armDelayResults(count: int = Path(...)): + DELAY_RESULTS_COUNT["remaining"] = count + return {"remaining": count} + + # v2 API: GET /jobs/{job_qrn} @app.get("/jobs/{job_id}") async def getJob( @@ -244,7 +276,10 @@ async def getJobResult( if job_id not in JOBS_MOCK_RESULTS: raise HTTPException(status_code=500, detail="Job results not found") - if random.random() < 0.2: + # Test hook: return "not yet available" for the next N /result calls if + # the delay counter is armed. Decrements on each call. 
+ if DELAY_RESULTS_COUNT["remaining"] > 0: + DELAY_RESULTS_COUNT["remaining"] -= 1 return { "success": False, "data": { From 3b0a1e4c84bab1378b4b37b279060143794cb21f Mon Sep 17 00:00:00 2001 From: Harshit Date: Wed, 15 Apr 2026 10:41:29 +0000 Subject: [PATCH 06/85] fix: formatting and headers --- .github/pre-commit/spelling_allowlist.txt | 10 +++ docs/sphinx/targets/cpp/qbraid.cpp | 1 - docs/sphinx/targets/python/qbraid.py | 3 +- lib/Optimizer/CodeGen/Passes.cpp | 4 +- .../helpers/qbraid/QbraidServerHelper.cpp | 84 +++++++++++++------ unittests/backends/qbraid/QbraidTester.cpp | 2 +- utils/mock_qpu/qbraid/__init__.py | 59 +++++++------ 7 files changed, 104 insertions(+), 59 deletions(-) diff --git a/.github/pre-commit/spelling_allowlist.txt b/.github/pre-commit/spelling_allowlist.txt index 64c9c045bed..984c5e929c8 100644 --- a/.github/pre-commit/spelling_allowlist.txt +++ b/.github/pre-commit/spelling_allowlist.txt @@ -1,6 +1,7 @@ ABI AFQMC API +api APIs AST Aer @@ -108,6 +109,8 @@ Photonics PyPI Pygments QAOA +QASM +QBRAID QCI QCaaS QEC @@ -122,6 +125,7 @@ QRMI QTX QX QaaS +Qbraid Qiskit QuEra QuTiP @@ -300,6 +304,7 @@ lossy lvalue macOS makefiles +measurementCounts merchantability mps multinomial @@ -311,6 +316,7 @@ natively normalization nullary nvcc +nvq observables optimizer optimizers @@ -333,20 +339,24 @@ probability programmatically pybind qaoa +qbraid qed qio +qrn quantize quantized qubit qubits qudit qudits +queryable qumode qumodes reStructuredText realtime reconfigurable reproducibility +resultData reusability runtime runtimes diff --git a/docs/sphinx/targets/cpp/qbraid.cpp b/docs/sphinx/targets/cpp/qbraid.cpp index 4b696005582..b6d859ec2de 100644 --- a/docs/sphinx/targets/cpp/qbraid.cpp +++ b/docs/sphinx/targets/cpp/qbraid.cpp @@ -4,7 +4,6 @@ // ``` // This will submit the job to the Qbraid ideal simulator target (default). 
- #include #include diff --git a/docs/sphinx/targets/python/qbraid.py b/docs/sphinx/targets/python/qbraid.py index 8450e3a6fd8..cf3fe483c6b 100644 --- a/docs/sphinx/targets/python/qbraid.py +++ b/docs/sphinx/targets/python/qbraid.py @@ -15,7 +15,6 @@ def kernel(): x.ctrl(qvector[0], qvector[1]) - # Execute on Qbraid and print out the results. # Option A: @@ -49,4 +48,4 @@ def kernel(): # any remaining classical code in the file will occur only # after the job has been returned from Qbraid. counts = cudaq.sample(kernel) -print(counts) \ No newline at end of file +print(counts) diff --git a/lib/Optimizer/CodeGen/Passes.cpp b/lib/Optimizer/CodeGen/Passes.cpp index dc41359f93b..808db9e3e2a 100644 --- a/lib/Optimizer/CodeGen/Passes.cpp +++ b/lib/Optimizer/CodeGen/Passes.cpp @@ -136,8 +136,8 @@ void cudaq::opt::registerTargetPipelines() { "Convert kernels to Fermioniq gate set.", addFermioniqPipeline); PassPipelineRegistration<>("qbraid-gate-set-mapping", - "Convert kernels to qBraid gate set.", - addQbraidPipeline); + "Convert kernels to qBraid gate set.", + addQbraidPipeline); } void cudaq::opt::registerCodeGenDialect(DialectRegistry ®istry) { diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp index 8b26f8b3d45..b53979fd98e 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp @@ -1,3 +1,11 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + #include "common/RestClient.h" #include "common/ServerHelper.h" #include "cudaq/Support/Version.h" @@ -30,7 +38,8 @@ class QbraidServerHelper : public ServerHelper { if (!config["machine"].empty()) { backendConfig["device_id"] = config["machine"]; } else { - backendConfig["device_id"] = getValueOrDefault(config, "device_id", DEFAULT_DEVICE); + backendConfig["device_id"] = + getValueOrDefault(config, "device_id", DEFAULT_DEVICE); } // Accept api_key from target arguments, fall back to QBRAID_API_KEY env var @@ -61,7 +70,8 @@ class QbraidServerHelper : public ServerHelper { ServerJobPayload createJob(std::vector &circuitCodes) override { if (backendConfig.find("job_path") == backendConfig.end()) { - throw std::runtime_error("job_path not found in config. Was initialize() called?"); + throw std::runtime_error( + "job_path not found in config. Was initialize() called?"); } std::vector jobs; @@ -90,18 +100,23 @@ class QbraidServerHelper : public ServerHelper { std::string extractJobId(ServerMessage &postResponse) override { // v2 API: jobQrn is nested under data envelope - if (postResponse.contains("data") && postResponse["data"].contains("jobQrn")) { + if (postResponse.contains("data") && + postResponse["data"].contains("jobQrn")) { return postResponse["data"]["jobQrn"].get(); } - throw std::runtime_error("ServerMessage doesn't contain 'data.jobQrn' key."); + throw std::runtime_error( + "ServerMessage doesn't contain 'data.jobQrn' key."); } std::string constructGetJobPath(ServerMessage &postResponse) override { // v2 API: use path parameter instead of query parameter - if (postResponse.contains("data") && postResponse["data"].contains("jobQrn")) { - return backendConfig.at("job_path") + "/" + postResponse["data"]["jobQrn"].get(); + if (postResponse.contains("data") && + postResponse["data"].contains("jobQrn")) { + return backendConfig.at("job_path") + "/" + + 
postResponse["data"]["jobQrn"].get(); } - throw std::runtime_error("ServerMessage doesn't contain 'data.jobQrn' key."); + throw std::runtime_error( + "ServerMessage doesn't contain 'data.jobQrn' key."); } std::string constructGetJobPath(std::string &jobId) override { @@ -118,7 +133,8 @@ class QbraidServerHelper : public ServerHelper { std::string status; // v2 API: status is nested under data envelope - if (getJobResponse.contains("data") && getJobResponse["data"].contains("status")) { + if (getJobResponse.contains("data") && + getJobResponse["data"].contains("status")) { status = getJobResponse["data"]["status"].get(); cudaq::info("Job status from v2 data envelope: {}", status); } else if (getJobResponse.contains("status")) { @@ -146,7 +162,8 @@ class QbraidServerHelper : public ServerHelper { // // Exercised deterministically via the mock's POST /test/delay_next_results // endpoint (see checkResultRetry / checkResultRetryExhaustion tests). - cudaq::sample_result processResults(ServerMessage &getJobResponse, std::string &jobId) override { + cudaq::sample_result processResults(ServerMessage &getJobResponse, + std::string &jobId) override { const int maxRetries = 3; const int waitTime = 2; const float backoffFactor = 2.0; @@ -156,15 +173,18 @@ class QbraidServerHelper : public ServerHelper { auto resultsPath = constructGetResultsPath(jobId); auto headers = getHeaders(); - cudaq::info("Fetching results from v2 endpoint (attempt {}/{}): {}", attempt + 1, maxRetries, resultsPath); + cudaq::info("Fetching results from v2 endpoint (attempt {}/{}): {}", + attempt + 1, maxRetries, resultsPath); RestClient client; auto resultJson = client.get("", resultsPath, headers, true); // v2 API: error indicated by success=false - if (resultJson.contains("success") && resultJson["success"].is_boolean() - && !resultJson["success"].get()) { + if (resultJson.contains("success") && + resultJson["success"].is_boolean() && + !resultJson["success"].get()) { std::string errorMsg = "Results 
not yet available"; - if (resultJson.contains("data") && resultJson["data"].contains("message")) { + if (resultJson.contains("data") && + resultJson["data"].contains("message")) { errorMsg = resultJson["data"]["message"].get(); } cudaq::info("Results endpoint returned success=false: {}", errorMsg); @@ -174,12 +194,14 @@ class QbraidServerHelper : public ServerHelper { } } // v2 API: measurementCounts nested under data.resultData - else if (resultJson.contains("data") - && resultJson["data"].contains("resultData") - && resultJson["data"]["resultData"].contains("measurementCounts")) { + else if (resultJson.contains("data") && + resultJson["data"].contains("resultData") && + resultJson["data"]["resultData"].contains( + "measurementCounts")) { cudaq::info("Processing results from v2 endpoint"); CountsDictionary counts; - auto &measurements = resultJson["data"]["resultData"]["measurementCounts"]; + auto &measurements = + resultJson["data"]["resultData"]["measurementCounts"]; for (const auto &[bitstring, count] : measurements.items()) { counts[bitstring] = @@ -195,15 +217,20 @@ class QbraidServerHelper : public ServerHelper { // No valid data yet and no explicit error - retry if (attempt < maxRetries - 1) { - int sleepTime = (attempt == 0) ? waitTime : waitTime * std::pow(backoffFactor, attempt); - cudaq::info("No valid results yet, retrying in {} seconds", sleepTime); + int sleepTime = (attempt == 0) + ? waitTime + : waitTime * std::pow(backoffFactor, attempt); + cudaq::info("No valid results yet, retrying in {} seconds", + sleepTime); std::this_thread::sleep_for(std::chrono::seconds(sleepTime)); } } catch (const std::exception &e) { cudaq::info("Exception when fetching results: {}", e.what()); if (attempt < maxRetries - 1) { - int sleepTime = (attempt == 0) ? waitTime : waitTime * std::pow(backoffFactor, attempt); + int sleepTime = (attempt == 0) + ? 
waitTime + : waitTime * std::pow(backoffFactor, attempt); cudaq::info("Retrying in {} seconds", sleepTime); std::this_thread::sleep_for(std::chrono::seconds(sleepTime)); } @@ -234,8 +261,7 @@ class QbraidServerHelper : public ServerHelper { // Normalizing to a single register is the canonical QASM 2 form and is // accepted uniformly by every qBraid-reachable backend. std::string normalizeClassicalRegisters(const std::string &qasm) const { - static const std::regex cregDeclRx( - R"(creg\s+(\w+)\s*\[\s*(\d+)\s*\]\s*;)"); + static const std::regex cregDeclRx(R"(creg\s+(\w+)\s*\[\s*(\d+)\s*\]\s*;)"); std::vector> cregs; for (auto it = std::sregex_iterator(qasm.begin(), qasm.end(), cregDeclRx); @@ -263,7 +289,8 @@ class QbraidServerHelper : public ServerHelper { std::regex measureTargetRx("->\\s*" + name + "\\s*\\[\\s*" + std::to_string(i) + "\\s*\\]"); out = std::regex_replace(out, measureTargetRx, - "-> qbraid__creg__[" + std::to_string(base + i) + "]"); + "-> qbraid__creg__[" + + std::to_string(base + i) + "]"); } } @@ -280,14 +307,16 @@ class QbraidServerHelper : public ServerHelper { out = std::regex_replace(out, toRemove, ""); } - cudaq::info("Normalized {} classical registers into single qbraid__creg__[{}]", - cregs.size(), totalBits); + cudaq::info( + "Normalized {} classical registers into single qbraid__creg__[{}]", + cregs.size(), totalBits); return out; } RestHeaders getHeaders() override { if (backendConfig.find("api_key") == backendConfig.end()) { - throw std::runtime_error("API key not found in config. Was initialize() called?"); + throw std::runtime_error( + "API key not found in config. 
Was initialize() called?"); } RestHeaders headers; @@ -297,7 +326,8 @@ class QbraidServerHelper : public ServerHelper { return headers; } - std::string getEnvVar(const std::string &key, const std::string &defaultVal, const bool isRequired) const { + std::string getEnvVar(const std::string &key, const std::string &defaultVal, + const bool isRequired) const { const char *env_var = std::getenv(key.c_str()); if (env_var == nullptr) { if (isRequired) { diff --git a/unittests/backends/qbraid/QbraidTester.cpp b/unittests/backends/qbraid/QbraidTester.cpp index 0d5b1fb1b09..7580ab62750 100644 --- a/unittests/backends/qbraid/QbraidTester.cpp +++ b/unittests/backends/qbraid/QbraidTester.cpp @@ -216,4 +216,4 @@ int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); auto ret = RUN_ALL_TESTS(); return ret; -} \ No newline at end of file +} diff --git a/utils/mock_qpu/qbraid/__init__.py b/utils/mock_qpu/qbraid/__init__.py index 70686afd7c3..a98dabd9b10 100644 --- a/utils/mock_qpu/qbraid/__init__.py +++ b/utils/mock_qpu/qbraid/__init__.py @@ -113,23 +113,17 @@ def simulate_job(qasm: str, num_shots: int) -> dict[str, int]: distribution = random.choices(possible_states, k=num_shots) result = {state: distribution.count(state) for state in set(distribution)} - if ( - num_qubits == 2 - and len(measured_qubits) == 1 - and measured_qubits[0] == 0 - and 0 in superposition_qubits - ): + if (num_qubits == 2 and len(measured_qubits) == 1 and + measured_qubits[0] == 0 and 0 in superposition_qubits): new_result = {} total_shots = num_shots half_shots = total_shots // 2 - new_result["00"] = random.randint( - half_shots - half_shots // 4, half_shots + half_shots // 4 - ) + new_result["00"] = random.randint(half_shots - half_shots // 4, + half_shots + half_shots // 4) new_result["01"] = 0 - new_result["10"] = random.randint( - half_shots - half_shots // 4, half_shots + half_shots // 4 - ) + new_result["10"] = random.randint(half_shots - half_shots // 4, + half_shots + 
half_shots // 4) new_result["11"] = 0 remaining = total_shots - (new_result["00"] + new_result["10"]) @@ -163,7 +157,8 @@ def poll_job_status(job_id: str) -> dict[str, Any]: # v2 API: POST /jobs @app.post("/jobs") -async def postJob(job: Job, x_api_key: Optional[str] = Header(None, alias="X-API-KEY")): +async def postJob(job: Job, + x_api_key: Optional[str] = Header(None, alias="X-API-KEY")): """Submit a quantum job for execution (v2 API).""" if x_api_key is None: raise HTTPException(status_code=401, detail="API key is required") @@ -190,7 +185,13 @@ async def postJob(job: Job, x_api_key: Optional[str] = Header(None, alias="X-API JOBS_MOCK_RESULTS[newId] = counts # v2 response: wrapped in success/data envelope - return {"success": True, "data": {"jobQrn": newId, "status": "INITIALIZING"}} + return { + "success": True, + "data": { + "jobQrn": newId, + "status": "INITIALIZING" + } + } # Test-only: arm a failure for the next submitted job. @@ -210,8 +211,8 @@ async def armDelayResults(count: int = Path(...)): # v2 API: GET /jobs/{job_qrn} @app.get("/jobs/{job_id}") async def getJob( - job_id: str = Path(...), - x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), + job_id: str = Path(...), + x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), ): """Retrieve the status of a quantum job (v2 API).""" if x_api_key is None: @@ -226,8 +227,8 @@ async def getJob( # v2 API: GET /jobs/{job_qrn}/program @app.get("/jobs/{job_id}/program") async def getJobProgram( - job_id: str = Path(...), - x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), + job_id: str = Path(...), + x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), ): """Retrieve the program of a quantum job (v2 API).""" if x_api_key is None: @@ -251,8 +252,8 @@ async def getJobProgram( # v2 API: GET /jobs/{job_qrn}/result @app.get("/jobs/{job_id}/result") async def getJobResult( - job_id: str = Path(...), - x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), + job_id: str = 
Path(...), + x_api_key: Optional[str] = Header(None, alias="X-API-KEY"), ): """Retrieve the results of a quantum job (v2 API).""" if x_api_key is None: @@ -263,14 +264,16 @@ async def getJobResult( if JOBS_MOCK_DB[job_id]["status"] in {"FAILED", "CANCELLED"}: raise HTTPException( - status_code=409, detail="Results unavailable. Job failed or was cancelled." - ) + status_code=409, + detail="Results unavailable. Job failed or was cancelled.") if JOBS_MOCK_DB[job_id]["status"] != "COMPLETED": # v2: use success=false instead of "error" field return { "success": False, - "data": {"status": JOBS_MOCK_DB[job_id]["status"]}, + "data": { + "status": JOBS_MOCK_DB[job_id]["status"] + }, } if job_id not in JOBS_MOCK_RESULTS: @@ -283,8 +286,10 @@ async def getJobResult( return { "success": False, "data": { - "status": "COMPLETED", - "message": "Failed to retrieve job results. Please wait, and try again.", + "status": + "COMPLETED", + "message": + "Failed to retrieve job results. Please wait, and try again.", }, } @@ -294,7 +299,9 @@ async def getJobResult( return { "success": True, "data": { - "resultData": {"measurementCounts": counts}, + "resultData": { + "measurementCounts": counts + }, "status": "COMPLETED", "cost": 0, "timeStamps": {}, From 8867cd080d78918edff0cd48bcc0ae57fe34ced9 Mon Sep 17 00:00:00 2001 From: Mitchell Date: Wed, 15 Apr 2026 06:33:45 -0700 Subject: [PATCH 07/85] Clean up ghcr-ci deployments in clean_up workflow (#4324) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deployments cleanup job only removes `default` environment deployments but not `ghcr-ci` ones. Every CI run creates multiple ghcr-ci deployments via dev_environment.yml, leaving "copy-pr-bot temporarily deployed to ghcr-ci — Inactive" entries cluttering PR timelines. Extend the existing cleanup loop to also delete ghcr-ci deployments. The production `ghcr-deployment` environment used by deployments.yml is not affected. 
Signed-off-by: mitchdz --- .github/workflows/clean_up.yml | 40 +++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/.github/workflows/clean_up.yml b/.github/workflows/clean_up.yml index ebca2c0aaa2..a5bedab7c05 100644 --- a/.github/workflows/clean_up.yml +++ b/.github/workflows/clean_up.yml @@ -144,6 +144,8 @@ jobs: # Since we use the same workflows during CI, a default environment that defines # the necessary variables is used instead. Unfortunately, this automatically # also creates an (unwanted) deployment, which we delete with this job. + # The ghcr-ci environment similarly produces unwanted deployment entries + # from the dev_environment workflow during CI runs on pull requests. # See also https://github.com/actions/runner/issues/2120 deployments: name: Deployments @@ -155,26 +157,28 @@ jobs: - uses: actions/github-script@v7 with: script: | - const deployments = await github.rest.repos.listDeployments({ - owner: context.repo.owner, - repo: context.repo.repo, - environment: 'default' - }); - await Promise.all( - deployments.data.map(async (deployment) => { - await github.rest.repos.createDeploymentStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - deployment_id: deployment.id, - state: 'inactive' - }); - return github.rest.repos.deleteDeployment({ + for (const environment of ['default', 'ghcr-ci']) { + const deployments = await github.rest.repos.listDeployments({ owner: context.repo.owner, repo: context.repo.repo, - deployment_id: deployment.id - }); - }) - ); + environment: environment + }); + await Promise.all( + deployments.data.map(async (deployment) => { + await github.rest.repos.createDeploymentStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + deployment_id: deployment.id, + state: 'inactive' + }); + return github.rest.repos.deleteDeployment({ + owner: context.repo.owner, + repo: context.repo.repo, + deployment_id: deployment.id + }); + }) + ); + } pr_cleanup: name: Clean up 
documentation previews From 48a1feb3bb722f40043bbd021765a039895f0799 Mon Sep 17 00:00:00 2001 From: Thomas Alexander Date: Wed, 15 Apr 2026 11:12:23 -0300 Subject: [PATCH 08/85] Fix decomposition pattern selection for unbounded control counts (#4320) Fixes #4319. The basis-driven pattern selection in `decomposition{basis=...}` failed to select decomposition chains involving `SToR1` and `TToR1` because these patterns were registered with `s(1)`/`t(1)` metadata (controlled-only) despite their implementations handling any control count. The graph lookup in `DecompositionPatternSelection.cpp` used exact hash matching on `OperatorInfo`, so an unbounded `(n)` entry could not match a concrete control count. This left `CCX` gates undecomposed when `t` was not directly in the target basis. The fix updates `SToR1`/`TToR1`/`R1ToU3`/`U3ToRotations` registration to `(n)` and adds `OperatorInfo::matches()` for wildcard control count matching in `incomingPatterns()` and `findGateDist()`. Signed-off-by: Thomas Alexander --- .../DecompositionPatternSelection.cpp | 64 ++++++++++++++----- .../Transforms/DecompositionPatterns.cpp | 15 ++--- .../tests/backends/test_circuit_opt_bench.py | 20 ++++++ .../DecompositionPatternSelectionTest.cpp | 20 ++++++ 4 files changed, 95 insertions(+), 24 deletions(-) diff --git a/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp b/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp index fda09bc8db2..9fac90636bf 100644 --- a/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp +++ b/lib/Optimizer/Transforms/DecompositionPatternSelection.cpp @@ -63,6 +63,21 @@ struct OperatorInfo { return name == other.name && numControls == other.numControls && isAdj == other.isAdj; } + + bool isUnbounded() const { + return numControls == std::numeric_limits::max(); + } + + /// Check if this gate matches another, treating unbounded (n) control + /// count as a wildcard that matches any concrete count. 
+ bool matches(const OperatorInfo &other) const { + if (name != other.name || isAdj != other.isAdj) + return false; + constexpr auto unbounded = std::numeric_limits::max(); + if (numControls == unbounded || other.numControls == unbounded) + return true; + return numControls == other.numControls; + } }; struct BasisTarget : public ConversionTarget { @@ -175,14 +190,15 @@ class DecompositionGraph { } /// Return all patterns that have the given gate as one of their targets. - /// - /// @param gate The gate to find incoming patterns for - /// @return A vector of pattern names (StringRef) whose targets include the - /// given gate - llvm::ArrayRef incomingPatterns(const OperatorInfo &gate) const { - static const llvm::SmallVector empty; - auto it = targetToPatterns.find(gate); - return it == targetToPatterns.end() ? empty : it->second; + /// Uses OperatorInfo::matches() to handle unbounded (n) control counts. + llvm::SmallVector + incomingPatterns(const OperatorInfo &gate) const { + llvm::SmallVector result; + for (const auto &[key, patterns] : targetToPatterns) { + if (key.matches(gate)) + result.append(patterns.begin(), patterns.end()); + } + return result; } /// Select subset of patterns relevant to decomposing to the given basis @@ -207,7 +223,12 @@ class DecompositionGraph { for (const auto &patternName : patternSelectionCache[hashVal]) { const auto &pattern = getPatternType(patternName); - patterns.add(pattern->create(patterns.getContext())); + // Patterns with unbounded (n) control counts get lower benefit so + // that specific patterns (e.g., CR1ToCX for r1(1)) are preferred + // when both match the same op. + OperatorInfo sourceInfo(pattern->getSourceOp()); + PatternBenefit benefit = sourceInfo.isUnbounded() ? 1 : 2; + patterns.add(pattern->create(patterns.getContext(), benefit)); } } @@ -260,18 +281,29 @@ class DecompositionGraph { gatesToVisit.push({gate, 0, std::nullopt}); } + /// Find the distance for a gate, handling unbounded (n) control counts. 
+ /// Exact hash lookup first for the common case, then a scan when the + /// query or any visited entry uses unbounded controls. + auto findGateDist = [&](const OperatorInfo &gate) -> std::size_t { + auto it = visitedGates.find(gate); + if (it != visitedGates.end()) + return it->second; + // Scan for wildcard matches (either side could be unbounded). + std::size_t best = std::numeric_limits::max(); + for (const auto &[visited, dist] : visitedGates) { + if (visited.matches(gate)) + best = std::min(best, dist); + } + return best; + }; + /// Compute the maximum distance from a pattern's targets to the basis /// gates. auto getPatternDist = [&](const auto &pattern) { auto targetGates = pattern->getTargetOps(); std::vector targetDistances; - for (const auto &targetGate : targetGates) { - if (visitedGates.count(targetGate)) { - targetDistances.push_back(visitedGates.at(targetGate)); - } else { - targetDistances.push_back(std::numeric_limits::max()); - } - } + for (const auto &targetGate : targetGates) + targetDistances.push_back(findGateDist(targetGate)); return *std::max_element(targetDistances.begin(), targetDistances.end()); }; diff --git a/lib/Optimizer/Transforms/DecompositionPatterns.cpp b/lib/Optimizer/Transforms/DecompositionPatterns.cpp index 1add53a2f85..755ab75af38 100644 --- a/lib/Optimizer/Transforms/DecompositionPatterns.cpp +++ b/lib/Optimizer/Transforms/DecompositionPatterns.cpp @@ -334,10 +334,9 @@ LogicalResult checkAndExtractControls(quake::OperatorInterface op, }; \ CUDAQ_REGISTER_TYPE(cudaq::DecompositionPatternType, PATTERN##Type, PATTERN) -// TODO: The decomposition patterns "SToR1", "TToR1", "R1ToU3", "U3ToRotations" -// can handle arbitrary number of controls, but currently metadata cannot -// capture this. The pattern types therefore only advertise them for a fixed -// number of controls (1 for "SToR1" and "TToR1", 0 for the rest). 
+// NOTE: The patterns SToR1, TToR1, R1ToU3, and U3ToRotations handle arbitrary +// control counts and are registered with (n) metadata. R1ToRz explicitly +// rejects controlled ops and uses bare metadata. //===----------------------------------------------------------------------===// // HOp decompositions @@ -608,7 +607,7 @@ struct R1ToU3 : public cudaq::DecompositionPattern { return success(); } }; -REGISTER_DECOMPOSITION_PATTERN(R1ToU3, "r1", "u3"); +REGISTER_DECOMPOSITION_PATTERN(R1ToU3, "r1(n)", "u3(n)"); // quake.r1 (θ) target // ───────────────────────────────── @@ -800,7 +799,7 @@ struct SToR1 : public cudaq::DecompositionPattern { return success(); } }; -REGISTER_DECOMPOSITION_PATTERN(SToR1, "s(1)", "r1(1)"); +REGISTER_DECOMPOSITION_PATTERN(SToR1, "s(n)", "r1(n)"); //===----------------------------------------------------------------------===// // TOp decompositions @@ -881,7 +880,7 @@ struct TToR1 : public cudaq::DecompositionPattern { return success(); } }; -REGISTER_DECOMPOSITION_PATTERN(TToR1, "t(1)", "r1(1)"); +REGISTER_DECOMPOSITION_PATTERN(TToR1, "t(n)", "r1(n)"); //===----------------------------------------------------------------------===// // XOp decompositions @@ -1818,7 +1817,7 @@ struct U3ToRotations return success(); } }; -REGISTER_DECOMPOSITION_PATTERN(U3ToRotations, "u3", "rz", "rx"); +REGISTER_DECOMPOSITION_PATTERN(U3ToRotations, "u3(n)", "rz(n)", "rx(n)"); } // namespace diff --git a/python/tests/backends/test_circuit_opt_bench.py b/python/tests/backends/test_circuit_opt_bench.py index 938e7a5c145..c295ff12a12 100644 --- a/python/tests/backends/test_circuit_opt_bench.py +++ b/python/tests/backends/test_circuit_opt_bench.py @@ -76,6 +76,26 @@ def test_custom_unitary_produces_2q_gates(): f"KAK produces at most 3 CX (6 CZ after basis change), got {two_q}") +def test_ccx_fully_decomposed(): + """CCX (Toffoli) must decompose to CZ basis, not remain as ccx. 
+ + The decomposition pass must select CCXToCCZ and CCZToCX patterns + even when t and s are not directly in the basis. Requires unbounded + (n) registration for SToR1/TToR1 and wildcard matching in the + pattern selection graph. + """ + cudaq.set_target('circuit-opt-bench') + + kernel = cudaq.make_kernel() + q = kernel.qalloc(4) + kernel.cx([q[0], q[1]], q[2]) + + resources = cudaq.estimate_resources(kernel) + ops = resources.to_dict() + assert 'ccx' not in ops, f"CCX not decomposed: {ops}" + assert resources.gate_count_for_arity(2) > 0 + + def _make_nonlocal_cx_kernel(): """Build a 5-qubit kernel with CX between non-adjacent qubits (q0, q4). On a path topology, q0 and q4 are 4 hops apart, forcing SWAP insertion.""" diff --git a/unittests/Optimizer/DecompositionPatternSelectionTest.cpp b/unittests/Optimizer/DecompositionPatternSelectionTest.cpp index e2007f0ed6d..03b596caeec 100644 --- a/unittests/Optimizer/DecompositionPatternSelectionTest.cpp +++ b/unittests/Optimizer/DecompositionPatternSelectionTest.cpp @@ -364,6 +364,26 @@ TEST_F(FullDecompositionPatternSelectionTest, DecomposeCCXToCZ) { EXPECT_EQ(selectedPatterns, exp); } +// Regression: multi-hop chain where intermediate gates (t, z(2)) are not +// in the basis but are reachable through further patterns. 
+// Chain: x(2) -> CCXToCCZ -> {h,z(2)} -> CCZToCX -> {t,x(1)} +// t -> TToR1 -> {r1(1)} -> CR1ToCX -> {r1,x(1)} +// r1 -> R1ToU3 -> {u3} -> U3ToRotations -> {rz,rx} +TEST_F(FullDecompositionPatternSelectionTest, DecomposeCCXDeepChain) { + std::vector targetBasis{"h", "rx", "ry", "rz", "x", "x(1)"}; + auto selectedPatterns = selectPatterns(targetBasis); + + EXPECT_TRUE(std::find(selectedPatterns.begin(), selectedPatterns.end(), + "CCXToCCZ") != selectedPatterns.end()) + << "CCXToCCZ not selected"; + EXPECT_TRUE(std::find(selectedPatterns.begin(), selectedPatterns.end(), + "CCZToCX") != selectedPatterns.end()) + << "CCZToCX not selected"; + EXPECT_TRUE(std::find(selectedPatterns.begin(), selectedPatterns.end(), + "TToR1") != selectedPatterns.end()) + << "TToR1 not selected"; +} + //===----------------------------------------------------------------------===// // Test OperatorInfo adjoint parsing //===----------------------------------------------------------------------===// From 06a903dbbcf5d88aa7fd6997c7e3bcfb1268823c Mon Sep 17 00:00:00 2001 From: "Adam T. Geller" Date: Wed, 15 Apr 2026 15:01:26 -0700 Subject: [PATCH 09/85] Fail earlier if counting resources from IR won't be possible (#4332) Signed-off-by: Adam Geller --- lib/Optimizer/Transforms/ResourceCount.cpp | 34 +++++++++++-------- runtime/cudaq/platform/default/python/QPU.cpp | 2 +- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/lib/Optimizer/Transforms/ResourceCount.cpp b/lib/Optimizer/Transforms/ResourceCount.cpp index 918667f906a..ed6ce573c67 100644 --- a/lib/Optimizer/Transforms/ResourceCount.cpp +++ b/lib/Optimizer/Transforms/ResourceCount.cpp @@ -16,6 +16,25 @@ using namespace mlir; mlir::FailureOr cudaq::opt::countResourcesFromIR(ModuleOp module) { + // Check upfront whether all qubit allocations have statically known sizes. + // If any veq has a dynamic size we cannot count qubits statically, so bail + // out before running the gate-erasing pass manager. 
+ std::size_t allocated = 0; + bool unresolvedVeq = false; + module.walk([&](quake::AllocaOp alloc) { + if (isa(alloc.getType())) { + allocated++; + } else if (auto size = quake::getVeqSize(alloc.getResult())) { + allocated += *size; + } else { + unresolvedVeq = true; + } + }); + if (unresolvedVeq) + return failure(); + + // All qubit sizes are statically known — proceed to count gates and erase + // them from the IR so the subsequent JIT compiles a near-empty module. cudaq::Resources counts; auto countGate = [&counts](std::string gate, std::vector controls, @@ -40,21 +59,6 @@ cudaq::opt::countResourcesFromIR(ModuleOp module) { if (failed(pmResult)) return failure(); - // Count allocated qubits from the IR. - std::size_t allocated = 0; - bool unresolvedVeq = false; - module.walk([&](quake::AllocaOp alloc) { - if (isa(alloc.getType())) { - allocated++; - } else if (auto size = quake::getVeqSize(alloc.getResult())) { - allocated += *size; - } else { - unresolvedVeq = true; - } - }); - if (unresolvedVeq) - return failure(); counts.setNumQubits(allocated); - return counts; } diff --git a/runtime/cudaq/platform/default/python/QPU.cpp b/runtime/cudaq/platform/default/python/QPU.cpp index 31666b46db4..a70ae4ac377 100644 --- a/runtime/cudaq/platform/default/python/QPU.cpp +++ b/runtime/cudaq/platform/default/python/QPU.cpp @@ -313,7 +313,7 @@ static void precountResources(ModuleOp module) { return; auto counts = cudaq::opt::countResourcesFromIR(module); if (failed(counts)) - throw std::runtime_error("Resource count preprocessing failed."); + return; nvqir::setResourceCounts(std::move(*counts)); } From 84d15982d39a63ea20087f5fa17cf26b14281137 Mon Sep 17 00:00:00 2001 From: Eric Schweitz Date: Wed, 15 Apr 2026 16:05:25 -0700 Subject: [PATCH 10/85] [unittest] Update the unittest to conform to new API contracts. (#4330) This updates the unittest so that cudaq::state objects are used to capture and pass state information (amplitude vectors) into kernels. 
The new API contract is that this sort of state information shall be passed into CUDA-Q kernels as state objects and not raw vectors. --------- Signed-off-by: Eric Schweitz --- unittests/CMakeLists.txt | 2 +- unittests/integration/builder_tester.cpp | 26 +++++++++++------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index e84493fbb70..08b05c2fd28 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -197,7 +197,7 @@ if (cuStateVec_FOUND) ) target_include_directories(test_custatevec_observe_from_sampling PRIVATE .) target_compile_definitions(test_custatevec_observe_from_sampling - PRIVATE -DNVQIR_BACKEND_NAME=custatevec_fp32) + PRIVATE -DNVQIR_BACKEND_NAME=custatevec_fp32 -DCUDAQ_SIMULATION_SCALAR_FP32) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE) target_link_options(test_custatevec_observe_from_sampling PRIVATE ${CUDAQ_FORCE_LINK_FLAG}) endif() diff --git a/unittests/integration/builder_tester.cpp b/unittests/integration/builder_tester.cpp index d6138960a8a..35998547a97 100644 --- a/unittests/integration/builder_tester.cpp +++ b/unittests/integration/builder_tester.cpp @@ -1365,9 +1365,10 @@ CUDAQ_TEST(BuilderTester, checkControlledRotations) { TEST(BuilderTester, checkFromStateVector) { std::vector vec{M_SQRT1_2, 0., 0., M_SQRT1_2}; + cudaq::state st0{vec}; { auto kernel = cudaq::make_kernel(); - auto qubits = kernel.qalloc(vec); + auto qubits = kernel.qalloc(st0); std::cout << kernel << "\n"; auto counts = cudaq::sample(kernel); counts.dump(); @@ -1381,11 +1382,10 @@ TEST(BuilderTester, checkFromStateVector) { } { - auto [kernel, initState] = - cudaq::make_kernel>(); + auto [kernel, initState] = cudaq::make_kernel(); auto qubits = kernel.qalloc(initState); std::cout << kernel << "\n"; - auto counts = cudaq::sample(kernel, vec); + auto counts = cudaq::sample(kernel, &st0); counts.dump(); EXPECT_EQ(counts.size(), 2); std::size_t counter = 0; @@ -1399,14 
+1399,13 @@ TEST(BuilderTester, checkFromStateVector) { { // 2 qubit 11 state std::vector vec{0., 0., 0., 1.}; - auto [kernel, initState] = - cudaq::make_kernel>(); + cudaq::state st1{vec}; + auto [kernel, initState] = cudaq::make_kernel(); auto qubits = kernel.qalloc(initState); - // induce the need for a kron prod between - // [0,0,0,1] and [1, 0, 0, 0] + // induce the need for a kron prod between [0,0,0,1] and [1, 0, 0, 0] auto anotherOne = kernel.qalloc(2); std::cout << kernel << "\n"; - auto counts = cudaq::sample(kernel, vec); + auto counts = cudaq::sample(kernel, &st1); counts.dump(); EXPECT_EQ(counts.size(), 1); EXPECT_EQ(counts.count("1100"), 1000); @@ -1415,14 +1414,13 @@ TEST(BuilderTester, checkFromStateVector) { { // 2 qubit 11 state std::vector vec{0., 0., 0., 1.}; - auto [kernel, initState] = - cudaq::make_kernel>(); + cudaq::state st2{std::move(vec)}; + auto [kernel, initState] = cudaq::make_kernel(); auto qubits = kernel.qalloc(initState); - // induce the need for a kron prod between - // [0,0,0,1] and [1, 0] + // induce the need for a kron prod between [0,0,0,1] and [1, 0] auto anotherOne = kernel.qalloc(); std::cout << kernel << "\n"; - auto counts = cudaq::sample(kernel, vec); + auto counts = cudaq::sample(kernel, &st2); counts.dump(); EXPECT_EQ(counts.size(), 1); EXPECT_EQ(counts.count("110"), 1000); From e530a2569b89c1be50c1b7c77efd31405a7075d4 Mon Sep 17 00:00:00 2001 From: Sachin Pisal Date: Wed, 15 Apr 2026 21:56:35 -0700 Subject: [PATCH 11/85] Pybind -> Nanobind (#4311) Migrating Python bindings from pybind11 to nanobind - Adding nanobind as a submodule - Creating NanobindAdaptors for MLIR C-API type casters - Keeping pybind11 only for upstream MLIR Python extensions - Converting all `*_py.cpp ` binding files, headers, CUDAQuantumExtension.cpp, pyDynamics, interop library, and PYSCF plugin to nanobind --------- Signed-off-by: Sachin Pisal --- .github/pre-commit/spelling_allowlist.txt | 1 + .gitmodules | 3 + CMakeLists.txt | 6 + 
docs/sphinx/using/basics/run_kernel.rst | 2 +- python/cudaq/operators/scalar/scalar_op.py | 53 +- python/extension/CMakeLists.txt | 45 +- python/extension/CUDAQuantumExtension.cpp | 82 ++- .../runtime/common/py_AnalogHamiltonian.cpp | 112 +-- python/runtime/common/py_AnalogHamiltonian.h | 6 +- python/runtime/common/py_CustomOpRegistry.cpp | 9 +- python/runtime/common/py_CustomOpRegistry.h | 6 +- python/runtime/common/py_EvolveResult.cpp | 42 +- python/runtime/common/py_EvolveResult.h | 6 +- python/runtime/common/py_ExecutionContext.cpp | 157 ++-- python/runtime/common/py_ExecutionContext.h | 6 +- python/runtime/common/py_NoiseModel.cpp | 402 ++++++----- python/runtime/common/py_NoiseModel.h | 6 +- python/runtime/common/py_ObserveResult.cpp | 109 +-- python/runtime/common/py_ObserveResult.h | 6 +- python/runtime/common/py_Resources.cpp | 59 +- python/runtime/common/py_Resources.h | 6 +- python/runtime/common/py_SampleResult.cpp | 104 +-- python/runtime/common/py_SampleResult.h | 6 +- python/runtime/cudaq/algorithms/py_draw.cpp | 14 +- python/runtime/cudaq/algorithms/py_draw.h | 4 +- python/runtime/cudaq/algorithms/py_evolve.cpp | 125 ++-- python/runtime/cudaq/algorithms/py_evolve.h | 6 +- .../cudaq/algorithms/py_observe_async.cpp | 51 +- .../cudaq/algorithms/py_observe_async.h | 4 +- .../runtime/cudaq/algorithms/py_optimizer.cpp | 145 ++-- .../runtime/cudaq/algorithms/py_optimizer.h | 6 +- .../cudaq/algorithms/py_resource_count.cpp | 11 +- .../cudaq/algorithms/py_resource_count.h | 4 +- python/runtime/cudaq/algorithms/py_run.cpp | 54 +- python/runtime/cudaq/algorithms/py_run.h | 6 +- .../cudaq/algorithms/py_sample_async.cpp | 33 +- .../cudaq/algorithms/py_sample_async.h | 6 +- .../cudaq/algorithms/py_sample_ptsbe.cpp | 239 +++--- .../cudaq/algorithms/py_sample_ptsbe.h | 4 +- python/runtime/cudaq/algorithms/py_state.cpp | 438 ++++++----- python/runtime/cudaq/algorithms/py_state.h | 6 +- .../runtime/cudaq/algorithms/py_translate.cpp | 6 +- 
.../runtime/cudaq/algorithms/py_translate.h | 6 +- .../runtime/cudaq/algorithms/py_unitary.cpp | 13 +- python/runtime/cudaq/algorithms/py_unitary.h | 6 +- python/runtime/cudaq/algorithms/py_utils.cpp | 81 ++- python/runtime/cudaq/algorithms/py_utils.h | 25 +- .../cudaq/domains/plugins/CMakeLists.txt | 6 +- .../cudaq/domains/plugins/PySCFDriver.cpp | 94 +-- python/runtime/cudaq/dynamics/CMakeLists.txt | 8 +- python/runtime/cudaq/dynamics/pyDynamics.cpp | 212 +++--- .../runtime/cudaq/operators/py_boson_op.cpp | 497 ++++++------- python/runtime/cudaq/operators/py_boson_op.h | 6 +- .../runtime/cudaq/operators/py_fermion_op.cpp | 494 +++++++------ .../runtime/cudaq/operators/py_fermion_op.h | 6 +- .../runtime/cudaq/operators/py_handlers.cpp | 199 ++--- python/runtime/cudaq/operators/py_handlers.h | 6 +- python/runtime/cudaq/operators/py_helpers.cpp | 50 +- python/runtime/cudaq/operators/py_helpers.h | 13 +- python/runtime/cudaq/operators/py_matrix.cpp | 65 +- python/runtime/cudaq/operators/py_matrix.h | 6 +- .../runtime/cudaq/operators/py_matrix_op.cpp | 454 ++++++------ python/runtime/cudaq/operators/py_matrix_op.h | 6 +- .../runtime/cudaq/operators/py_scalar_op.cpp | 142 +++- python/runtime/cudaq/operators/py_scalar_op.h | 6 +- python/runtime/cudaq/operators/py_spin_op.cpp | 679 +++++++++--------- python/runtime/cudaq/operators/py_spin_op.h | 6 +- .../runtime/cudaq/operators/py_super_op.cpp | 81 ++- python/runtime/cudaq/operators/py_super_op.h | 6 +- .../cudaq/platform/py_alt_launch_kernel.cpp | 344 ++++----- .../cudaq/platform/py_alt_launch_kernel.h | 28 +- .../cudaq/qis/py_execution_manager.cpp | 17 +- .../runtime/cudaq/qis/py_execution_manager.h | 6 +- python/runtime/cudaq/qis/py_pauli_word.cpp | 16 +- python/runtime/cudaq/qis/py_pauli_word.h | 6 +- .../cudaq/target/py_runtime_target.cpp | 66 +- .../runtime/cudaq/target/py_runtime_target.h | 6 +- .../runtime/cudaq/target/py_testing_utils.cpp | 10 +- .../runtime/cudaq/target/py_testing_utils.h | 6 +- 
python/runtime/interop/CMakeLists.txt | 9 +- python/runtime/interop/PythonCppInterop.h | 109 +-- .../runtime/interop/PythonCppInteropDecls.h | 91 +++ python/runtime/mlir/py_register_dialects.cpp | 199 ++--- python/runtime/mlir/py_register_dialects.h | 6 +- python/tests/interop/CMakeLists.txt | 7 +- .../tests/interop/quantum_lib/CMakeLists.txt | 1 - .../tests/interop/quantum_lib/quantum_lib.cpp | 2 - .../tests/interop/quantum_lib/quantum_lib.h | 1 - .../test_cpp_quantum_algorithm_module.cpp | 24 +- python/utils/NanobindAdaptors.h | 472 ++++++++++++ python/utils/OpaqueArguments.h | 44 +- python/utils/PyTypes.h | 117 +-- .../nlopt/nlopt-src/src/algs/stogo/global.h | 2 +- runtime/cudaq/operators/matrix.cpp | 5 +- runtime/cudaq/ptsbe/PTSBEExecutionData.h | 2 +- runtime/cudaq/qis/pauli_word.h | 1 + tpls/nanobind | 1 + 97 files changed, 4021 insertions(+), 3146 deletions(-) create mode 100644 python/runtime/interop/PythonCppInteropDecls.h create mode 100644 python/utils/NanobindAdaptors.h create mode 160000 tpls/nanobind diff --git a/.github/pre-commit/spelling_allowlist.txt b/.github/pre-commit/spelling_allowlist.txt index 64c9c045bed..70ea6508c54 100644 --- a/.github/pre-commit/spelling_allowlist.txt +++ b/.github/pre-commit/spelling_allowlist.txt @@ -307,6 +307,7 @@ multithreaded mutex namespace namespaces +nanobind natively normalization nullary diff --git a/.gitmodules b/.gitmodules index 622993890c6..644ab8cc24f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -48,3 +48,6 @@ [submodule "tpls/Stim"] path = tpls/Stim url = https://github.com/quantumlib/Stim +[submodule "tpls/nanobind"] + path = tpls/nanobind + url = https://github.com/wjakob/nanobind.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 548bd30adcf..04a1c07db34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -713,6 +713,12 @@ if (CUDAQ_ENABLE_PYTHON) # Python bindings generated as part of the CUDA-Q build and bindings generated for # third party CUDA-Q libraries; see also 
https://github.com/pybind/pybind11/issues/1262 add_subdirectory(tpls/pybind11) + + # nanobind is used for all CUDA-Q Python bindings. pybind11 is retained only + # for upstream MLIR Python extensions (e.g., _mlirAsyncPasses) which use + # mlir/Bindings/Python/PybindAdaptors.h. + add_subdirectory(tpls/nanobind) + add_subdirectory(python) endif() diff --git a/docs/sphinx/using/basics/run_kernel.rst b/docs/sphinx/using/basics/run_kernel.rst index 3999fb4e673..371fdd4685d 100644 --- a/docs/sphinx/using/basics/run_kernel.rst +++ b/docs/sphinx/using/basics/run_kernel.rst @@ -218,7 +218,7 @@ The observe function allows us to calculate expectation values for a defined qua The :func:`cudaq.observe` method takes a kernel and its arguments as inputs, along with a :class:`cudaq.operators.spin.SpinOperator`. Using the `cudaq.spin` module, operators may be defined as a linear combination of Pauli strings. Functions, such - as :func:`cudaq.spin.i`, :func:`cudaq.spin.x`, :func:`cudaq.spin.y`, :func:`cudaq.spin.z` may be used to construct more + as `cudaq.spin.i`, `cudaq.spin.x`, `cudaq.spin.y`, `cudaq.spin.z` may be used to construct more complex spin Hamiltonians on multiple qubits. .. 
tab:: C++ diff --git a/python/cudaq/operators/scalar/scalar_op.py b/python/cudaq/operators/scalar/scalar_op.py index c95927e1900..2c0245badb2 100644 --- a/python/cudaq/operators/scalar/scalar_op.py +++ b/python/cudaq/operators/scalar/scalar_op.py @@ -7,11 +7,11 @@ # ============================================================================ # from __future__ import annotations -import inspect, numpy # type: ignore -from typing import Any, Callable, Mapping, Optional +import numpy # type: ignore +from typing import Any, Callable, Mapping from numpy.typing import NDArray -from ..helpers import NumericType, _aggregate_parameters, _args_from_kwargs, _parameter_docs +from ..helpers import NumericType, _aggregate_parameters from cudaq.mlir._mlir_libs._quakeDialects.cudaq_runtime import ScalarOperator @@ -21,7 +21,7 @@ def _const_init(cls, constant_value: NumericType) -> ScalarOperator: """ if not isinstance(constant_value, NumericType): raise ValueError("argument must be a numeric constant") - return cls(constant_value) + return cls(complex(constant_value)) ScalarOperator.const = classmethod(_const_init) @@ -65,7 +65,7 @@ def _compose( if self.is_constant(): return ScalarOperator.const(fct(self.evaluate(), other)) generator = lambda **kwargs: fct(self.evaluate(**kwargs), other) - return ScalarOperator(generator, self.parameters) + return ScalarOperator(generator, **self.parameters) elif type(other) == ScalarOperator: if self.is_constant() and other.is_constant(): return ScalarOperator.const(fct(self.evaluate(), other.evaluate())) @@ -73,7 +73,7 @@ def _compose( other.evaluate(**kwargs)) parameter_info = _aggregate_parameters( [self.parameters, other.parameters]) - return ScalarOperator(generator, parameter_info) + return ScalarOperator(generator, **parameter_info) return NotImplemented @@ -97,44 +97,3 @@ def _compose( v2: v2 + v1) ScalarOperator.__rsub__ = lambda self, other: _compose(self, other, lambda v1, v2: v2 - v1) - - -def _instantiate(cls, - generator: 
NumericType | Callable[..., NumericType], - parameter_info: Optional[Mapping[str, str]] = None) -> None: - """ - Instantiates a scalar operator. - - Arguments: - generator: The value of the scalar operator as a function of its - parameters. The generator may take any number of complex-valued - arguments and must return a number. Each parameter must be passed - as a keyword argument when evaluating the operator. - """ - instance = super(ScalarOperator, cls).__new__(cls) - if isinstance(generator, NumericType): - instance.__init__(numpy.complex128(generator)) - else: - # A variable number of arguments (i.e. `*args`) cannot be supported - # for generators; it would prevent proper argument handling while - # supporting additions and multiplication of all kinds of operators. - arg_spec = inspect.getfullargspec(generator) - if arg_spec.varargs is not None: - raise ValueError( - f"the function defining a scalar operator must not take *args") - if parameter_info is None: - parameter_info = {} - for arg_name in arg_spec.args + arg_spec.kwonlyargs: - parameter_info[arg_name] = _parameter_docs( - arg_name, generator.__doc__) - - def generator_wrapper(kwargs: dict[str, NumericType]): - generator_args, remaining_kwargs = _args_from_kwargs( - generator, **kwargs) - return generator(*generator_args, **remaining_kwargs) - - instance.__init__(generator_wrapper, **parameter_info) - return instance - - -ScalarOperator.__new__ = staticmethod(_instantiate) diff --git a/python/extension/CMakeLists.txt b/python/extension/CMakeLists.txt index 52bb22a14db..dac02d47fca 100644 --- a/python/extension/CMakeLists.txt +++ b/python/extension/CMakeLists.txt @@ -13,6 +13,43 @@ endif() include(HandleLLVMOptions) include(AddMLIRPython) +function(add_mlir_python_extension libname extname) + cmake_parse_arguments(ARG + "" + "INSTALL_COMPONENT;INSTALL_DIR;OUTPUT_DIRECTORY" + "SOURCES;LINK_LIBS" + ${ARGN}) + + # Use nanobind for CUDA-Q's own extension (_quakeDialects) and pybind11 + # for upstream MLIR 
extensions (AsyncPasses, RegisterEverything, etc.). + if(libname MATCHES "_quakeDialects") + nanobind_add_module(${libname} NB_STATIC ${ARG_SOURCES}) + target_compile_options(${libname} PRIVATE -frtti -fexceptions -Wno-cast-qual) + else() + pybind11_add_module(${libname} MODULE ${ARG_SOURCES}) + target_compile_options(${libname} PRIVATE -frtti -fexceptions) + endif() + + set_target_properties(${libname} PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${ARG_OUTPUT_DIRECTORY} + OUTPUT_NAME "${extname}" + NO_SONAME ON + ) + + target_link_libraries(${libname} PRIVATE ${ARG_LINK_LIBS}) + target_link_options(${libname} PRIVATE + $<$:LINKER:--exclude-libs,ALL> + ) + + if(ARG_INSTALL_DIR) + install(TARGETS ${libname} + COMPONENT ${ARG_INSTALL_COMPONENT} + LIBRARY DESTINATION "${ARG_INSTALL_DIR}" + RUNTIME DESTINATION "${ARG_INSTALL_DIR}" + ) + endif() +endfunction() + # Specifies that all MLIR packages are co-located under the cudaq # top level package (the API has been embedded in a relocatable way). add_compile_definitions("MLIR_PYTHON_PACKAGE_PREFIX=cudaq.mlir.") @@ -112,8 +149,8 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension cudaq-mlir-runtime-headers ) -target_include_directories(CUDAQuantumPythonSources.Extension INTERFACE - ${CMAKE_SOURCE_DIR}/python +target_include_directories(CUDAQuantumPythonSources.Extension INTERFACE + ${CMAKE_SOURCE_DIR}/python ${CMAKE_SOURCE_DIR}/python/utils ${CMAKE_SOURCE_DIR}/runtime ) @@ -161,6 +198,10 @@ add_mlir_python_modules(CUDAQuantumPythonModules CUDAQuantumPythonCAPI ) +if(TARGET nanobind-static) + target_compile_options(nanobind-static PRIVATE -Wno-cast-qual -Wno-covered-switch-default) +endif() + ## The Python bindings module for Quake dialect depends on CUDAQ libraries ## which it can't locate since they are in "../../lib" and the 'rpath' is set ## to '$ORIGIN' by default. 
diff --git a/python/extension/CUDAQuantumExtension.cpp b/python/extension/CUDAQuantumExtension.cpp index 679c191a7a6..ac1ca729446 100644 --- a/python/extension/CUDAQuantumExtension.cpp +++ b/python/extension/CUDAQuantumExtension.cpp @@ -43,24 +43,26 @@ #include "runtime/cudaq/qis/py_pauli_word.h" #include "runtime/cudaq/target/py_runtime_target.h" #include "runtime/cudaq/target/py_testing_utils.h" -#include "runtime/interop/PythonCppInterop.h" +#include "runtime/interop/PythonCppInteropDecls.h" #include "runtime/mlir/py_register_dialects.h" #include "utils/LinkedLibraryHolder.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/Parser/Parser.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" -#include -#include -#include - -namespace py = pybind11; +#include +#include +#include +#include +#include +#include +#include using namespace cudaq; static std::unique_ptr holder; -PYBIND11_MODULE(_quakeDialects, m) { +NB_MODULE(_quakeDialects, m) { holder = std::make_unique(); bindRegisterDialects(m); @@ -92,8 +94,10 @@ PYBIND11_MODULE(_quakeDialects, m) { holder->setTarget(*target, extraConfig); } }, - py::arg("option") = py::none(), py::arg("emulate") = py::none(), - py::arg("target") = py::none(), "Initialize the CUDA-Q environment."); + nanobind::arg("option") = nanobind::none(), + nanobind::arg("emulate") = nanobind::none(), + nanobind::arg("target") = nanobind::none(), + "Initialize the CUDA-Q environment."); bindRuntimeTarget(cudaqRuntime, *holder.get()); bindMeasureCounts(cudaqRuntime); @@ -197,41 +201,46 @@ PYBIND11_MODULE(_quakeDialects, m) { auto orcaSubmodule = cudaqRuntime.def_submodule("orca"); orcaSubmodule.def( "sample", - py::overload_cast &, std::vector &, - std::vector &, std::vector &, int, - std::size_t>(&orca::sample), + nanobind::overload_cast &, + std::vector &, std::vector &, + std::vector &, int, std::size_t>( + &orca::sample), "Performs 
Time Bin Interferometer (TBI) boson sampling experiments on " "ORCA's backends", - py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"), - py::arg("ps_angles"), py::arg("n_samples") = 10000, - py::arg("qpu_id") = 0); + nanobind::arg("input_state"), nanobind::arg("loop_lengths"), + nanobind::arg("bs_angles"), nanobind::arg("ps_angles"), + nanobind::arg("n_samples") = 10000, nanobind::arg("qpu_id") = 0); orcaSubmodule.def( "sample", - py::overload_cast &, std::vector &, - std::vector &, int, std::size_t>(&orca::sample), + nanobind::overload_cast &, + std::vector &, std::vector &, + int, std::size_t>(&orca::sample), "Performs Time Bin Interferometer (TBI) boson sampling experiments on " "ORCA's backends", - py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"), - py::arg("n_samples") = 10000, py::arg("qpu_id") = 0); + nanobind::arg("input_state"), nanobind::arg("loop_lengths"), + nanobind::arg("bs_angles"), nanobind::arg("n_samples") = 10000, + nanobind::arg("qpu_id") = 0); orcaSubmodule.def( "sample_async", - py::overload_cast &, std::vector &, - std::vector &, std::vector &, int, - std::size_t>(&orca::sample_async), + nanobind::overload_cast &, + std::vector &, std::vector &, + std::vector &, int, std::size_t>( + &orca::sample_async), "Performs Time Bin Interferometer (TBI) boson sampling experiments on " "ORCA's backends", - py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"), - py::arg("ps_angles"), py::arg("n_samples") = 10000, - py::arg("qpu_id") = 0); + nanobind::arg("input_state"), nanobind::arg("loop_lengths"), + nanobind::arg("bs_angles"), nanobind::arg("ps_angles"), + nanobind::arg("n_samples") = 10000, nanobind::arg("qpu_id") = 0); orcaSubmodule.def( "sample_async", - py::overload_cast &, std::vector &, - std::vector &, int, std::size_t>( - &orca::sample_async), + nanobind::overload_cast &, + std::vector &, std::vector &, + int, std::size_t>(&orca::sample_async), "Performs Time Bin Interferometer (TBI) 
boson sampling experiments on " "ORCA's backends", - py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"), - py::arg("n_samples") = 10000, py::arg("qpu_id") = 0); + nanobind::arg("input_state"), nanobind::arg("loop_lengths"), + nanobind::arg("bs_angles"), nanobind::arg("n_samples") = 10000, + nanobind::arg("qpu_id") = 0); auto photonicsSubmodule = cudaqRuntime.def_submodule("photonics"); photonicsSubmodule.def( @@ -239,7 +248,7 @@ PYBIND11_MODULE(_quakeDialects, m) { [](std::size_t &level) { return getExecutionManager()->allocateQudit(level); }, - "Allocate a qudit of given level.", py::arg("level")); + "Allocate a qudit of given level.", nanobind::arg("level")); photonicsSubmodule.def( "apply_operation", [](const std::string &name, std::vector ¶ms, @@ -254,20 +263,21 @@ PYBIND11_MODULE(_quakeDialects, m) { spin_op::identity()); }, "Apply the input photonics operation on the target qudits.", - py::arg("name"), py::arg("params"), py::arg("targets")); + nanobind::arg("name"), nanobind::arg("params"), nanobind::arg("targets")); photonicsSubmodule.def( "measure", [](std::size_t level, std::size_t id, const std::string ®Name) { return getExecutionManager()->measure(QuditInfo(level, id), regName); }, - "Measure the input qudit(s).", py::arg("level"), py::arg("qudit"), - py::arg("register_name") = ""); + "Measure the input qudit(s).", nanobind::arg("level"), + nanobind::arg("qudit"), nanobind::arg("register_name") = ""); photonicsSubmodule.def( "release_qudit", [](std::size_t level, std::size_t id) { getExecutionManager()->returnQudit(QuditInfo(level, id)); }, - "Release a qudit of given id.", py::arg("level"), py::arg("id")); + "Release a qudit of given id.", nanobind::arg("level"), + nanobind::arg("id")); cudaqRuntime.def("cloneModule", [](MlirModule mod) { return wrap(unwrap(mod).clone()); }); cudaqRuntime.def("isTerminator", [](MlirOperation op) { diff --git a/python/runtime/common/py_AnalogHamiltonian.cpp 
b/python/runtime/common/py_AnalogHamiltonian.cpp index b9049f6e961..ec182338e03 100644 --- a/python/runtime/common/py_AnalogHamiltonian.cpp +++ b/python/runtime/common/py_AnalogHamiltonian.cpp @@ -9,78 +9,78 @@ #include "py_AnalogHamiltonian.h" #include "common/AnalogHamiltonian.h" #include "common/JsonConvert.h" -#include - -namespace py = pybind11; +#include +#include +#include namespace cudaq { /// @brief Binds the `cudaq::ahs` classes. -void bindAnalogHamiltonian(py::module &mod) { +void bindAnalogHamiltonian(nanobind::module_ &mod) { - py::class_(mod, "AtomArrangement") - .def(py::init<>()) - .def_readwrite("sites", &cudaq::ahs::AtomArrangement::sites) - .def_readwrite("filling", &cudaq::ahs::AtomArrangement::filling); + nanobind::class_(mod, "AtomArrangement") + .def(nanobind::init<>()) + .def_rw("sites", &cudaq::ahs::AtomArrangement::sites) + .def_rw("filling", &cudaq::ahs::AtomArrangement::filling); - py::class_(mod, "SetUp") - .def(py::init<>()) - .def_readwrite("ahs_register", &cudaq::ahs::Setup::ahs_register); + nanobind::class_(mod, "SetUp") + .def(nanobind::init<>()) + .def_rw("ahs_register", &cudaq::ahs::Setup::ahs_register); - py::class_(mod, "TimeSeries") - .def(py::init<>()) - .def(py::init>>()) - .def_readwrite("values", &cudaq::ahs::TimeSeries::values) - .def_readwrite("times", &cudaq::ahs::TimeSeries::times); + nanobind::class_(mod, "TimeSeries") + .def(nanobind::init<>()) + .def(nanobind::init>>()) + .def_rw("values", &cudaq::ahs::TimeSeries::values) + .def_rw("times", &cudaq::ahs::TimeSeries::times); - py::class_(mod, "FieldPattern") + nanobind::class_(mod, "FieldPattern") /// NOTE: Other constructors not required from Python interface - .def(py::init<>()) - .def_readwrite("patternStr", &cudaq::ahs::FieldPattern::patternStr) - .def_readwrite("patternVals", &cudaq::ahs::FieldPattern::patternVals); - - py::class_(mod, "PhysicalField") - .def(py::init<>()) - .def_readwrite("time_series", &cudaq::ahs::PhysicalField::time_series) - 
.def_readwrite("pattern", &cudaq::ahs::PhysicalField::pattern); - - py::class_(mod, "DrivingField") - .def(py::init<>()) - .def_readwrite("amplitude", &cudaq::ahs::DrivingField::amplitude) - .def_readwrite("phase", &cudaq::ahs::DrivingField::phase) - .def_readwrite("detuning", &cudaq::ahs::DrivingField::detuning); - - py::class_(mod, "LocalDetuning") - .def(py::init<>()) - .def_readwrite("magnitude", &cudaq::ahs::LocalDetuning::magnitude); - - py::class_(mod, "Hamiltonian") - .def(py::init<>()) - .def_readwrite("drivingFields", &cudaq::ahs::Hamiltonian::drivingFields) - .def_readwrite("localDetuning", &cudaq::ahs::Hamiltonian::localDetuning); - - py::class_(mod, "Program") - .def(py::init<>()) - .def_readwrite("setup", &cudaq::ahs::Program::setup) - .def_readwrite("hamiltonian", &cudaq::ahs::Program::hamiltonian) + .def(nanobind::init<>()) + .def_rw("patternStr", &cudaq::ahs::FieldPattern::patternStr) + .def_rw("patternVals", &cudaq::ahs::FieldPattern::patternVals); + + nanobind::class_(mod, "PhysicalField") + .def(nanobind::init<>()) + .def_rw("time_series", &cudaq::ahs::PhysicalField::time_series) + .def_rw("pattern", &cudaq::ahs::PhysicalField::pattern); + + nanobind::class_(mod, "DrivingField") + .def(nanobind::init<>()) + .def_rw("amplitude", &cudaq::ahs::DrivingField::amplitude) + .def_rw("phase", &cudaq::ahs::DrivingField::phase) + .def_rw("detuning", &cudaq::ahs::DrivingField::detuning); + + nanobind::class_(mod, "LocalDetuning") + .def(nanobind::init<>()) + .def_rw("magnitude", &cudaq::ahs::LocalDetuning::magnitude); + + nanobind::class_(mod, "Hamiltonian") + .def(nanobind::init<>()) + .def_rw("drivingFields", &cudaq::ahs::Hamiltonian::drivingFields) + .def_rw("localDetuning", &cudaq::ahs::Hamiltonian::localDetuning); + + nanobind::class_(mod, "Program") + .def(nanobind::init<>()) + .def_rw("setup", &cudaq::ahs::Program::setup) + .def_rw("hamiltonian", &cudaq::ahs::Program::hamiltonian) .def( "to_json", [](const cudaq::ahs::Program &p) { return 
json(p).dump(); }, "Convert Program to JSON"); - py::class_(mod, "ShotMetadata") - .def(py::init<>()) - .def_readwrite("shotStatus", &cudaq::ahs::ShotMetadata::shotStatus); + nanobind::class_(mod, "ShotMetadata") + .def(nanobind::init<>()) + .def_rw("shotStatus", &cudaq::ahs::ShotMetadata::shotStatus); - py::class_(mod, "ShotResult") - .def(py::init<>()) - .def_readwrite("preSequence", &cudaq::ahs::ShotResult::preSequence) - .def_readwrite("postSequence", &cudaq::ahs::ShotResult::postSequence); + nanobind::class_(mod, "ShotResult") + .def(nanobind::init<>()) + .def_rw("preSequence", &cudaq::ahs::ShotResult::preSequence) + .def_rw("postSequence", &cudaq::ahs::ShotResult::postSequence); - py::class_(mod, "ShotMeasurement") - .def(py::init<>()) - .def_readwrite("shotMetadata", &cudaq::ahs::ShotMeasurement::shotMetadata) - .def_readwrite("shotResult", &cudaq::ahs::ShotMeasurement::shotResult); + nanobind::class_(mod, "ShotMeasurement") + .def(nanobind::init<>()) + .def_rw("shotMetadata", &cudaq::ahs::ShotMeasurement::shotMetadata) + .def_rw("shotResult", &cudaq::ahs::ShotMeasurement::shotResult); /// TODO: Add other classes if needed } diff --git a/python/runtime/common/py_AnalogHamiltonian.h b/python/runtime/common/py_AnalogHamiltonian.h index bfc098955a7..a1e039a8fa5 100644 --- a/python/runtime/common/py_AnalogHamiltonian.h +++ b/python/runtime/common/py_AnalogHamiltonian.h @@ -6,13 +6,11 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Binds the `cudaq::ahs` classes. 
-void bindAnalogHamiltonian(py::module &mod); +void bindAnalogHamiltonian(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_CustomOpRegistry.cpp b/python/runtime/common/py_CustomOpRegistry.cpp index 35c5132695b..6d09cd8d69b 100644 --- a/python/runtime/common/py_CustomOpRegistry.cpp +++ b/python/runtime/common/py_CustomOpRegistry.cpp @@ -7,9 +7,10 @@ ******************************************************************************/ #include "py_CustomOpRegistry.h" #include "common/CustomOp.h" -#include -#include -#include +#include +#include +#include +#include namespace cudaq { struct py_unitary_operation : public unitary_operation { @@ -22,7 +23,7 @@ struct py_unitary_operation : public unitary_operation { } }; -void bindCustomOpRegistry(py::module &mod) { +void bindCustomOpRegistry(nanobind::module_ &mod) { mod.def( "register_custom_operation", [&](const std::string &opName) { diff --git a/python/runtime/common/py_CustomOpRegistry.h b/python/runtime/common/py_CustomOpRegistry.h index dcd4f2c2b2e..f9b6d2003eb 100644 --- a/python/runtime/common/py_CustomOpRegistry.h +++ b/python/runtime/common/py_CustomOpRegistry.h @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind the custom operation registry to Python. 
-void bindCustomOpRegistry(py::module &mod); +void bindCustomOpRegistry(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_EvolveResult.cpp b/python/runtime/common/py_EvolveResult.cpp index 007acf6577e..6a57cebaa92 100644 --- a/python/runtime/common/py_EvolveResult.cpp +++ b/python/runtime/common/py_EvolveResult.cpp @@ -9,36 +9,36 @@ #include "py_EvolveResult.h" #include "common/EvolveResult.h" #include "cudaq/algorithms/evolve_internal.h" -#include -#include - -namespace py = pybind11; +#include +#include +#include namespace cudaq { /// @brief Bind the `cudaq::evolve_result` and `cudaq::async_evolve_result` /// data classes to python as `cudaq.EvolveResult` and /// `cudaq.AsyncEvolveResult`. -void bindEvolveResult(py::module &mod) { - py::class_( +void bindEvolveResult(nanobind::module_ &mod) { + nanobind::class_( mod, "EvolveResult", "Stores the execution data from an invocation of :func:`evolve`.\n") // IMPORTANT: state overloads must be provided before vector // overloads. Otherwise, Python might try to access the __len__ of state // during overload resolution. __len__ is not always well-defined for all // state types and may raise an exception. - .def(py::init()) - .def(py::init>()) - .def(py::init>()) - .def(py::init>()) - .def(py::init, - std::vector>>()) - .def(py::init, std::vector>>()) + .def(nanobind::init()) + .def(nanobind::init>()) + .def(nanobind::init>()) + .def(nanobind::init>()) + .def(nanobind::init, + std::vector>>()) + .def(nanobind::init, + std::vector>>()) .def( "final_state", - [](evolve_result &self) -> py::object { + [](evolve_result &self) -> nanobind::object { if (!self.states.has_value() || self.states->empty()) - return py::none(); - return py::cast(self.states->back()); + return nanobind::none(); + return nanobind::cast(self.states->back()); }, "Stores the final state produced by a call to :func:`evolve`. 
" "Represent the state of a quantum system after time evolution under " @@ -54,11 +54,11 @@ void bindEvolveResult(py::module &mod) { ":func:`evolve`.\n") .def( "final_expectation_values", - [](evolve_result &self) -> py::object { + [](evolve_result &self) -> nanobind::object { if (!self.expectation_values.has_value() || self.expectation_values->empty()) - return py::none(); - return py::cast(self.expectation_values->back()); + return nanobind::none(); + return nanobind::cast(self.expectation_values->back()); }, "Stores the final expectation values, that is the results produced " "by " @@ -81,12 +81,12 @@ void bindEvolveResult(py::module &mod) { "if no intermediate results were requested, or if no observables " "were specified in the call.\n"); - py::class_( + nanobind::class_( mod, "AsyncEvolveResult", "Stores the execution data from an invocation of :func:`evolve_async`.\n") .def( "get", [](async_evolve_result &self) { return self.get(); }, - py::call_guard(), + nanobind::call_guard(), "Retrieve the evolution result from the asynchronous evolve " "execution\n."); } diff --git a/python/runtime/common/py_EvolveResult.h b/python/runtime/common/py_EvolveResult.h index 48ddfb9950b..1bafe73cd2d 100644 --- a/python/runtime/common/py_EvolveResult.h +++ b/python/runtime/common/py_EvolveResult.h @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Binds `cudaq.EvolveResult` and `cudaq.AsyncEvolveResult`. 
-void bindEvolveResult(py::module &mod); +void bindEvolveResult(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_ExecutionContext.cpp b/python/runtime/common/py_ExecutionContext.cpp index 83b71fbea5e..132462462de 100644 --- a/python/runtime/common/py_ExecutionContext.cpp +++ b/python/runtime/common/py_ExecutionContext.cpp @@ -12,10 +12,10 @@ #include "cudaq/utils/cudaq_utils.h" #include "mlir/ExecutionEngine/ExecutionEngine.h" #include -#include -#include - -namespace py = pybind11; +#include +#include +#include +#include namespace nvqir { std::string_view getQirOutputLog(); @@ -28,30 +28,29 @@ class PersistJITEngine {}; namespace cudaq { -void bindExecutionContext(py::module &mod) { - py::class_(mod, "ExecutionContext") - .def(py::init()) - .def(py::init(), py::arg("name"), - py::arg("shots"), py::arg("qpu_id") = 0) - .def_readwrite("kernelName", &cudaq::ExecutionContext::kernelName) - .def_readonly("result", &cudaq::ExecutionContext::result) - .def_readwrite("asyncExec", &cudaq::ExecutionContext::asyncExec) - .def_readonly("asyncResult", &cudaq::ExecutionContext::asyncResult) - .def_readwrite("hasConditionalsOnMeasureResults", - &cudaq::ExecutionContext::hasConditionalsOnMeasureResults) - .def_readwrite("totalIterations", - &cudaq::ExecutionContext::totalIterations) - .def_readwrite("batchIteration", &cudaq::ExecutionContext::batchIteration) - .def_readwrite("numberTrajectories", - &cudaq::ExecutionContext::numberTrajectories) - .def_readwrite("explicitMeasurements", - &cudaq::ExecutionContext::explicitMeasurements) - .def_readwrite("allowJitEngineCaching", - &cudaq::ExecutionContext::allowJitEngineCaching) - .def_readwrite("useParametricJit", - &cudaq::ExecutionContext::useParametricJit) - .def_readonly("invocationResultBuffer", - &cudaq::ExecutionContext::invocationResultBuffer) +void bindExecutionContext(nanobind::module_ &mod) { + nanobind::class_(mod, "ExecutionContext") + .def(nanobind::init()) + .def(nanobind::init(), + 
nanobind::arg("name"), nanobind::arg("shots"), + nanobind::arg("qpu_id") = 0) + .def_rw("kernelName", &cudaq::ExecutionContext::kernelName) + .def_ro("result", &cudaq::ExecutionContext::result) + .def_rw("asyncExec", &cudaq::ExecutionContext::asyncExec) + .def_ro("asyncResult", &cudaq::ExecutionContext::asyncResult) + .def_rw("hasConditionalsOnMeasureResults", + &cudaq::ExecutionContext::hasConditionalsOnMeasureResults) + .def_rw("totalIterations", &cudaq::ExecutionContext::totalIterations) + .def_rw("batchIteration", &cudaq::ExecutionContext::batchIteration) + .def_rw("numberTrajectories", + &cudaq::ExecutionContext::numberTrajectories) + .def_rw("explicitMeasurements", + &cudaq::ExecutionContext::explicitMeasurements) + .def_rw("allowJitEngineCaching", + &cudaq::ExecutionContext::allowJitEngineCaching) + .def_rw("useParametricJit", &cudaq::ExecutionContext::useParametricJit) + .def_ro("invocationResultBuffer", + &cudaq::ExecutionContext::invocationResultBuffer) .def("unset_jit_engine", [&](cudaq::ExecutionContext &execCtx) { if (execCtx.jitEng) { @@ -68,44 +67,50 @@ void bindExecutionContext(py::module &mod) { [](cudaq::ExecutionContext &ctx) { return ctx.expectationValue; }) // ----- Context management using with blocks ----- // Unlike in C++, we do not support nested execution contexts in Python. - .def("__enter__", - [](cudaq::ExecutionContext &ctx) -> ExecutionContext & { - if (cudaq::getExecutionContext()) { - throw std::runtime_error("Context already set. Nested execution " - "contexts are not supported in Python"); - } - auto &platform = cudaq::get_platform(); - platform.configureExecutionContext(ctx); - cudaq::detail::setExecutionContext(&ctx); - platform.beginExecution(); - return ctx; - }) - .def("__exit__", [](cudaq::ExecutionContext &ctx, py::object type, - py::object value, py::object traceback) { - if (type.is_none()) { - // Normal exit: finalize results, clean up the simulator, - // and reset the context (guaranteed even if finalize throws). 
- auto &platform = cudaq::get_platform(); - detail::try_finally( - [&] { + .def( + "__enter__", + [](cudaq::ExecutionContext &ctx) -> ExecutionContext & { + if (cudaq::getExecutionContext()) { + throw std::runtime_error("Context already set. Nested execution " + "contexts are not supported in Python"); + } + auto &platform = cudaq::get_platform(); + platform.configureExecutionContext(ctx); + cudaq::detail::setExecutionContext(&ctx); + platform.beginExecution(); + return ctx; + }, + nanobind::rv_policy::reference) + .def( + "__exit__", + [](cudaq::ExecutionContext &ctx, nanobind::object type, + nanobind::object value, nanobind::object traceback) { + if (type.is_none()) { + // Normal exit: finalize results, clean up the simulator, + // and reset the context (guaranteed even if finalize throws). + auto &platform = cudaq::get_platform(); + detail::try_finally( + [&] { + platform.finalizeExecutionContext(ctx); + platform.endExecution(); + }, + detail::resetExecutionContext); + } else { + // The kernel threw. Still need to tear down the platform so + // the simulator doesn't carry stale state into the next run. + // Separate invoke_no_throw so the context reset always runs. + detail::invoke_no_throw([&] { + auto &platform = cudaq::get_platform(); platform.finalizeExecutionContext(ctx); platform.endExecution(); - }, - detail::resetExecutionContext); - } else { - // The kernel threw. Still need to tear down the platform so - // the simulator doesn't carry stale state into the next run. - // Separate invoke_no_throw so the context reset always runs. - detail::invoke_no_throw([&] { - auto &platform = cudaq::get_platform(); - platform.finalizeExecutionContext(ctx); - platform.endExecution(); - }); - // Always reset context, even if the above cleanup failed. - detail::invoke_no_throw(detail::resetExecutionContext); - } - return false; - }); + }); + // Always reset context, even if the above cleanup failed. 
+ detail::invoke_no_throw(detail::resetExecutionContext); + } + return false; + }, + nanobind::arg("type").none(), nanobind::arg("value").none(), + nanobind::arg("traceback").none()); mod.def("supportsExplicitMeasurements", []() { auto &platform = cudaq::get_platform(); return platform.supports_explicit_measurements(); @@ -121,33 +126,35 @@ void bindExecutionContext(py::module &mod) { return !isRemoteSimulator && (platform.is_remote() || platform.is_emulated()); }, - py::arg("qpuId") = 0); + nanobind::arg("qpuId") = 0); mod.def("getQirOutputLog", []() { return nvqir::getQirOutputLog(); }); mod.def("clearQirOutputLog", []() { nvqir::clearQirOutputLog(); }); mod.def("decodeQirOutputLog", - [](const std::string &outputLog, py::buffer decodedResults) { + [](const std::string &outputLog, nanobind::bytearray decodedResults) { cudaq::RecordLogParser parser; parser.parse(outputLog); - auto info = decodedResults.request(); - // Get the buffer and length of buffer (in bytes) from the parser. auto *origBuffer = parser.getBufferPtr(); const std::size_t bufferSize = parser.getBufferSize(); - std::memcpy(info.ptr, origBuffer, bufferSize); + std::memcpy(decodedResults.data(), origBuffer, bufferSize); }); - py::class_( + nanobind::class_( mod, "reuse_compiler_artifacts", "Within this context, CUDAQ will blindly reuse compiled objects." 
"It is up to the user to ensure that there are never two distinct" "computations launched within a single context.") - .def(py::init()) + .def(nanobind::init<>()) .def("__enter__", [](PersistJITEngine &ctx) -> void { cudaq::compiler_artifact::enablePersistentJITEngine(); }) - .def("__exit__", [](PersistJITEngine &ctx, py::object type, - py::object value, py::object traceback) { - cudaq::compiler_artifact::disablePersistentJITEngine(); - }); + .def( + "__exit__", + [](PersistJITEngine &ctx, nanobind::object type, + nanobind::object value, nanobind::object traceback) { + cudaq::compiler_artifact::disablePersistentJITEngine(); + }, + nanobind::arg("type").none(), nanobind::arg("value").none(), + nanobind::arg("traceback").none()); } } // namespace cudaq diff --git a/python/runtime/common/py_ExecutionContext.h b/python/runtime/common/py_ExecutionContext.h index d4004941135..7df4e909b43 100644 --- a/python/runtime/common/py_ExecutionContext.h +++ b/python/runtime/common/py_ExecutionContext.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindExecutionContext(py::module &mod); +void bindExecutionContext(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_NoiseModel.cpp b/python/runtime/common/py_NoiseModel.cpp index 82563f86284..cf4f96b85cc 100644 --- a/python/runtime/common/py_NoiseModel.cpp +++ b/python/runtime/common/py_NoiseModel.cpp @@ -10,43 +10,41 @@ #include "common/NoiseModel.h" #include "cudaq.h" #include -#include -#include -#include +#include +#include +#include +#include +#include +#include namespace cudaq { -/// @brief Extract the array data from a buffer_info into our +/// @brief Extract the array data from a 2-d ndarray into our /// own allocated data pointer. /// This supports 2-d array in either row or column major. 
-void extractKrausData(py::buffer_info &info, complex *data) { - if (info.format != py::format_descriptor::format()) - throw std::runtime_error( - "Incompatible buffer format, must be np.complex128."); - - if (info.ndim != 2) - throw std::runtime_error("Incompatible buffer shape " + - std::to_string(info.ndim) + "."); +void extractKrausData(nanobind::ndarray, nanobind::ndim<2>, + nanobind::c_contig> + arr, + complex *data) { + auto rows = arr.shape(0); + auto cols = arr.shape(1); + auto *srcData = static_cast *>(arr.data()); constexpr bool rowMajor = true; - typedef Eigen::MatrixXcd::Scalar Scalar; typedef Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> RowMajorMat; auto strides = Eigen::Stride( - info.strides[rowMajor ? 0 : 1] / (py::ssize_t)sizeof(Scalar), - info.strides[rowMajor ? 1 : 0] / (py::ssize_t)sizeof(Scalar)); - auto map = - Eigen::Map>( - static_cast(info.ptr), info.shape[0], info.shape[1], - strides); + arr.stride(rowMajor ? 0 : 1), arr.stride(rowMajor ? 1 : 0)); + auto map = Eigen::Map>( + srcData, rows, cols, strides); RowMajorMat eigenMat(map); - memcpy(data, eigenMat.data(), - sizeof(complex) * (info.shape[0] * info.shape[1])); + memcpy(data, eigenMat.data(), sizeof(complex) * (rows * cols)); } /// @brief Bind the cudaq::noise_model, kraus_op, and kraus_channel. 
-void bindNoiseModel(py::module &mod) { +void bindNoiseModel(nanobind::module_ &mod) { mod.def("set_noise", &set_noise, "Set the underlying noise model."); mod.def("unset_noise", &unset_noise, @@ -54,86 +52,87 @@ void bindNoiseModel(py::module &mod) { mod.def( "get_noise", []() { return cudaq::get_platform().get_noise(); }, "Get the underlying noise model."); - py::class_( + nanobind::class_( mod, "NoiseModel", "The `NoiseModel` defines a set of :class:`KrausChannel`'s applied to " "specific qubits after the invocation of specified quantum operations.") - .def(py::init<>([mod]() { - // Create the noise model - auto model = std::make_unique(); - - // Define a map of channel names to generator functions - static std::map &)>> - channelGenerators = { - {"DepolarizationChannel", - [](const std::vector &p) -> kraus_channel { - return depolarization_channel(p); - }}, - {"AmplitudeDampingChannel", - [](const std::vector &p) -> kraus_channel { - return amplitude_damping_channel(p); - }}, - {"BitFlipChannel", - [](const std::vector &p) -> kraus_channel { - return bit_flip_channel(p); - }}, - {"PhaseFlipChannel", - [](const std::vector &p) -> kraus_channel { - return phase_flip_channel(p); - }}, - {"XError", - [](const std::vector &p) -> kraus_channel { - return x_error(p); - }}, - {"YError", - [](const std::vector &p) -> kraus_channel { - return y_error(p); - }}, - {"ZError", - [](const std::vector &p) -> kraus_channel { - return z_error(p); - }}, - {"PhaseDamping", - [](const std::vector &p) -> kraus_channel { - return phase_damping(p); - }}, - {"Pauli1", - [](const std::vector &p) -> kraus_channel { - return pauli1(p); - }}, - {"Pauli2", - [](const std::vector &p) -> kraus_channel { - return pauli2(p); - }}, - {"Depolarization1", - [](const std::vector &p) -> kraus_channel { - return depolarization1(p); - }}, - {"Depolarization2", - [](const std::vector &p) -> kraus_channel { - return depolarization2(p); - }}}; - - // Register each channel generator - for (const auto 
&[name, generator] : channelGenerators) { - if (py::hasattr(mod, name.c_str())) { - py::type channelType = py::getattr(mod, name.c_str()); - auto key = py::hash(channelType); - model->register_channel(key, generator); - } - } - - return model; - }), - "Construct a noise model with all built-in channels pre-registered.") + .def( + "__init__", + [mod](noise_model *self) { + new (self) noise_model(); + + // Define a map of channel names to generator functions + static std::map &)>> + channelGenerators = { + {"DepolarizationChannel", + [](const std::vector &p) -> kraus_channel { + return depolarization_channel(p); + }}, + {"AmplitudeDampingChannel", + [](const std::vector &p) -> kraus_channel { + return amplitude_damping_channel(p); + }}, + {"BitFlipChannel", + [](const std::vector &p) -> kraus_channel { + return bit_flip_channel(p); + }}, + {"PhaseFlipChannel", + [](const std::vector &p) -> kraus_channel { + return phase_flip_channel(p); + }}, + {"XError", + [](const std::vector &p) -> kraus_channel { + return x_error(p); + }}, + {"YError", + [](const std::vector &p) -> kraus_channel { + return y_error(p); + }}, + {"ZError", + [](const std::vector &p) -> kraus_channel { + return z_error(p); + }}, + {"PhaseDamping", + [](const std::vector &p) -> kraus_channel { + return phase_damping(p); + }}, + {"Pauli1", + [](const std::vector &p) -> kraus_channel { + return pauli1(p); + }}, + {"Pauli2", + [](const std::vector &p) -> kraus_channel { + return pauli2(p); + }}, + {"Depolarization1", + [](const std::vector &p) -> kraus_channel { + return depolarization1(p); + }}, + {"Depolarization2", + [](const std::vector &p) -> kraus_channel { + return depolarization2(p); + }}}; + + // Register each channel generator + for (const auto &[name, generator] : channelGenerators) { + if (nanobind::hasattr(mod, name.c_str())) { + nanobind::type_object channelType = + nanobind::borrow( + nanobind::getattr(mod, name.c_str())); + auto key = nanobind::hash(channelType); + 
self->register_channel(key, generator); + } + } + }, + "Construct a noise model with all built-in channels pre-registered.") .def( "register_channel", - [](noise_model &self, const py::type krausT) { - auto key = py::hash(krausT); + [](noise_model &self, const nanobind::type_object krausT) { + auto key = nanobind::hash(krausT); std::function &)> lambda = [krausT](const std::vector &p) -> kraus_channel { - return krausT(p).cast(); + return nanobind::cast(krausT(p)); }; self.register_channel(key, lambda); }, @@ -144,7 +143,8 @@ void bindNoiseModel(py::module &mod) { std::vector &qubits, kraus_channel &channel) { self.add_channel(opName, qubits, channel); }, - py::arg("operator"), py::arg("qubits"), py::arg("channel"), + nanobind::arg("operator"), nanobind::arg("qubits"), + nanobind::arg("channel"), R"#(Add the given :class:`KrausChannel` to be applied after invocation of the specified quantum operation. @@ -159,7 +159,7 @@ of the specified quantum operation. const noise_model::PredicateFuncTy &pre) { self.add_channel(opName, pre); }, - py::arg("operator"), py::arg("pre"), + nanobind::arg("operator"), nanobind::arg("pre"), R"#(Add the given :class:`KrausChannel` generator callback to be applied after invocation of the specified quantum operation. @@ -173,7 +173,8 @@ of the specified quantum operation. std::size_t num_controls = 0) { self.add_all_qubit_channel(opName, channel, num_controls); }, - py::arg("operator"), py::arg("channel"), py::arg("num_controls") = 0, + nanobind::arg("operator"), nanobind::arg("channel"), + nanobind::arg("num_controls") = 0, R"#(Add the given :class:`KrausChannel` to be applied after invocation of the specified quantum operation on arbitrary qubits. @@ -189,7 +190,7 @@ of the specified quantum operation on arbitrary qubits. 
const std::vector &qubits) { return self.get_channels(op, qubits); }, - py::arg("operator"), py::arg("qubits"), + nanobind::arg("operator"), nanobind::arg("qubits"), "Return the :class:`KrausChannel`'s that make up this noise model.") .def( "get_channels", @@ -198,35 +199,44 @@ of the specified quantum operation on arbitrary qubits. const std::vector &controls) { return self.get_channels(op, qubits, controls); }, - py::arg("operator"), py::arg("qubits"), py::arg("controls"), + nanobind::arg("operator"), nanobind::arg("qubits"), + nanobind::arg("controls"), "Return the :class:`KrausChannel`'s that make up this noise model."); } -void bindKrausOp(py::module &mod) { - py::class_( - mod, "KrausOperator", py::buffer_protocol(), +void bindKrausOp(nanobind::module_ &mod) { + nanobind::class_( + mod, "KrausOperator", "The `KrausOperator` is represented by a matrix and serves as an element " "of a quantum channel such that :code:`Sum Ki Ki^dag = I.`") - .def_buffer([](kraus_op &op) -> py::buffer_info { - return py::buffer_info(op.data.data(), sizeof(complex), - py::format_descriptor::format(), 2, - {op.nRows, op.nCols}, - {sizeof(complex) * op.nCols, sizeof(complex)}); - }) - .def(py::init([](const py::buffer &b) { - py::buffer_info info = b.request(); - std::vector v(info.shape[0] * info.shape[1]); - extractKrausData(info, v.data()); - return kraus_op(v); - }), - "Create a :class:`KrausOperator` from a buffer of data, like a " - "numpy array.") - .def_readonly("row_count", &kraus_op::nRows, - "The number of rows in the matrix representation of this " - ":class:`KrausOperator`.") - .def_readonly("col_count", &kraus_op::nCols, - "The number of columns in the matrix representation of " - "this :class:`KrausOperator`."); + .def( + "__array__", + [](kraus_op &op, nanobind::object dtype_obj, + nanobind::object copy_obj) { + size_t shape[2] = {op.nRows, op.nCols}; + return nanobind::ndarray>( + op.data.data(), 2, shape, nanobind::handle()); + }, + nanobind::arg("dtype") = 
nanobind::none(), + nanobind::arg("copy") = nanobind::none()) + .def( + "__init__", + [](kraus_op *self, + nanobind::ndarray, nanobind::ndim<2>, + nanobind::c_contig> + arr) { + std::vector v(arr.shape(0) * arr.shape(1)); + extractKrausData(arr, v.data()); + new (self) kraus_op(v); + }, + "Create a :class:`KrausOperator` from a buffer of data, like a " + "numpy array.") + .def_ro("row_count", &kraus_op::nRows, + "The number of rows in the matrix representation of this " + ":class:`KrausOperator`.") + .def_ro("col_count", &kraus_op::nCols, + "The number of columns in the matrix representation of " + "this :class:`KrausOperator`."); } // Need a trampoline class to make this sub-class-able from Python @@ -235,8 +245,8 @@ class PyKrausChannel : public kraus_channel { using kraus_channel::kraus_channel; }; -void bindNoiseChannels(py::module &mod) { - py::enum_(mod, "NoiseModelType") +void bindNoiseChannels(nanobind::module_ &mod) { + nanobind::enum_(mod, "NoiseModelType") .value("Unknown", cudaq::noise_model_type::unknown) .value("DepolarizationChannel", cudaq::noise_model_type::depolarization_channel) @@ -254,46 +264,50 @@ void bindNoiseChannels(py::module &mod) { .value("Depolarization1", cudaq::noise_model_type::depolarization1) .value("Depolarization2", cudaq::noise_model_type::depolarization2); - py::class_( - mod, "KrausChannel", py::dynamic_attr(), + nanobind::class_( + mod, "KrausChannel", "The `KrausChannel` is composed of a list of " ":class:`KrausOperator`'s and " "is applied to a specific qubit or set of qubits.") - .def(py::init<>(), "Create an empty :class:`KrausChannel`") - .def(py::init &>(), + .def(nanobind::init<>(), "Create an empty :class:`KrausChannel`") + .def(nanobind::init &>(), "Create a :class:`KrausChannel` composed of a list of " ":class:`KrausOperator`'s.") - .def(py::init([](py::list ops) { - std::vector kops; - for (std::size_t i = 0; i < ops.size(); i++) { - auto buffer = ops[i].cast(); - auto info = buffer.request(); - auto shape = 
info.shape; - std::vector v(shape[0] * shape[1]); - extractKrausData(info, v.data()); - kops.emplace_back(v); - } - return kraus_channel(kops); - }), - "Create a :class:`KrausChannel` given a list of " - ":class:`KrausOperator`'s.") - .def_readwrite("parameters", &kraus_channel::parameters) - .def_readwrite("noise_type", &kraus_channel::noise_type) + .def( + "__init__", + [](kraus_channel *self, nanobind::list ops) { + std::vector kops; + for (std::size_t i = 0; i < ops.size(); i++) { + auto arr = nanobind::cast, nanobind::ndim<2>, nanobind::c_contig>>( + ops[i]); + auto rows = arr.shape(0); + auto cols = arr.shape(1); + std::vector v(rows * cols); + extractKrausData(arr, v.data()); + kops.emplace_back(v); + } + new (self) kraus_channel(kops); + }, + "Create a :class:`KrausChannel` given a list of " + ":class:`KrausOperator`'s.") + .def_rw("parameters", &kraus_channel::parameters) + .def_rw("noise_type", &kraus_channel::noise_type) .def("get_ops", &kraus_channel::get_ops, "Return the :class:`KrausOperator`'s in this :class:`KrausChannel`.") .def( "__getitem__", [](kraus_channel &self, std::size_t idx) { return self[idx]; }, - py::arg("index"), + nanobind::arg("index"), "Return the :class:`KrausOperator` at the given index in this " ":class:`KrausChannel`.") .def( "append", [](kraus_channel &self, kraus_op op) { self.push_back(op); }, - py::arg("operator"), + nanobind::arg("operator"), "Add a :class:`KrausOperator` to this :class:`KrausChannel`."); - py::class_( + nanobind::class_( mod, "DepolarizationChannel", R"#(Models the decoherence of the qubit state and phase into a mixture " of the computational basis states, `|0>` and `|1>`. @@ -318,15 +332,15 @@ void bindNoiseChannels(py::module &mod) { For `probability = 0.0`, the channel will behave noise-free. For `probability = 0.75`, the channel will fully depolarize the state. 
For `probability = 1.0`, the channel will be uniform.)#") - .def(py::init>()) - .def(py::init(), py::arg("probability"), + .def(nanobind::init>()) + .def(nanobind::init(), nanobind::arg("probability"), "Initialize the `DepolarizationChannel` with the provided " "`probability`.") - .def_readonly_static( + .def_ro_static( "num_parameters", &depolarization_channel::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "AmplitudeDampingChannel", R"#(Models the dissipation of energy due to system interactions with the environment. @@ -341,15 +355,15 @@ void bindNoiseChannels(py::module &mod) { representing the probability that the qubit will decay to its ground state. The probability of the qubit remaining in the same state is therefore `1 - probability`.)#") - .def(py::init>()) - .def(py::init(), py::arg("probability"), + .def(nanobind::init>()) + .def(nanobind::init(), nanobind::arg("probability"), "Initialize the `AmplitudeDampingChannel` with the provided " "`probability`.") - .def_readonly_static( + .def_ro_static( "num_parameters", &litude_damping_channel::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "BitFlipChannel", R"#(Models the decoherence of the qubit state. 
Its constructor expects a float value, `probability`, representing the probability that the qubit @@ -364,14 +378,14 @@ void bindNoiseChannels(py::module &mod) { The probability of the qubit remaining in the same state is therefore `1 - probability`.)#") - .def(py::init>()) - .def(py::init(), py::arg("probability"), + .def(nanobind::init>()) + .def(nanobind::init(), nanobind::arg("probability"), "Initialize the `BitFlipChannel` with the provided `probability`.") - .def_readonly_static( + .def_ro_static( "num_parameters", &bit_flip_channel::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "PhaseFlipChannel", R"#(Models the decoherence of the qubit phase. Its constructor expects a float value, `probability`, representing the probability of a random @@ -385,95 +399,95 @@ void bindNoiseChannels(py::module &mod) { The probability of the qubit phase remaining untouched is therefore `1 - probability`.)#") - .def(py::init>()) - .def(py::init(), py::arg("probability"), + .def(nanobind::init>()) + .def(nanobind::init(), nanobind::arg("probability"), "Initialize the `PhaseFlipChannel` with the provided `probability`.") - .def_readonly_static( + .def_ro_static( "num_parameters", &phase_flip_channel::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "PhaseDamping", R"#(A Kraus channel that models the single-qubit phase damping error. This is similar to AmplitudeDamping, but for phase.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &phase_damping::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "ZError", R"#(A Pauli error that applies the Z operator when an error occurs. 
It is the same as PhaseFlipChannel.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &z_error::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "XError", R"#(A Pauli error that applies the X operator when an error occurs. It is the same as BitFlipChannel.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &x_error::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "YError", R"#(A Pauli error that applies the Y operator when an error occurs.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &y_error::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "Pauli1", R"#(A single-qubit Pauli error that applies either an X error, Y error, or Z error. The probability of each X, Y, or Z error is supplied as a parameter.)#") - .def(py::init>()) - .def_readonly_static( + .def(nanobind::init>()) + .def_ro_static( "num_parameters", &pauli1::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "Pauli2", R"#(A 2-qubit Pauli error that applies one of the following errors, with the probabilities specified as a vector. 
Possible errors: IX, IY, IZ, XI, XX, XY, XZ, YI, YX, YY, YZ, ZI, ZX, ZY, and ZZ.)#") - .def(py::init>()) - .def_readonly_static( + .def(nanobind::init>()) + .def_ro_static( "num_parameters", &pauli2::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "Depolarization1", R"#(The same as DepolarizationChannel (single qubit depolarization))#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &depolarization1::num_parameters, "The number of parameters this channel requires at construction."); - py::class_( + nanobind::class_( mod, "Depolarization2", R"#(A 2-qubit depolarization error that applies one of the following errors. Possible errors: IX, IY, IZ, XI, XX, XY, XZ, YI, YX, YY, YZ, ZI, ZX, ZY, and ZZ.)#") - .def(py::init>()) - .def(py::init()) - .def_readonly_static( + .def(nanobind::init>()) + .def(nanobind::init()) + .def_ro_static( "num_parameters", &depolarization2::num_parameters, "The number of parameters this channel requires at construction."); } -void bindNoise(py::module &mod) { +void bindNoise(nanobind::module_ &mod) { bindNoiseModel(mod); bindKrausOp(mod); bindNoiseChannels(mod); diff --git a/python/runtime/common/py_NoiseModel.h b/python/runtime/common/py_NoiseModel.h index 75c0f0b8df7..cc03a52e138 100644 --- a/python/runtime/common/py_NoiseModel.h +++ b/python/runtime/common/py_NoiseModel.h @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind the cudaq::noise_model data-type to Python. 
-void bindNoise(py::module &mod); +void bindNoise(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_ObserveResult.cpp b/python/runtime/common/py_ObserveResult.cpp index d26185673ee..5383391b9dc 100644 --- a/python/runtime/common/py_ObserveResult.cpp +++ b/python/runtime/common/py_ObserveResult.cpp @@ -11,21 +11,23 @@ #include "common/ObserveResult.h" #include "cudaq/algorithms/observe.h" -namespace py = pybind11; +#include +#include + namespace { // FIXME(OperatorCpp): Remove this when the operator class is implemented in // C++ -cudaq::spin_op to_spin_op(py::object &obj) { - if (py::hasattr(obj, "_to_spinop")) - return obj.attr("_to_spinop")().cast(); - return obj.cast(); +cudaq::spin_op to_spin_op(nanobind::object &obj) { + if (nanobind::hasattr(obj, "_to_spinop")) + return nanobind::cast(obj.attr("_to_spinop")()); + return nanobind::cast(obj); } -cudaq::spin_op to_spin_op_term(py::object &obj) { +cudaq::spin_op to_spin_op_term(nanobind::object &obj) { auto op = cudaq::spin_op::empty(); - if (py::hasattr(obj, "_to_spinop")) - op = obj.attr("_to_spinop")().cast(); + if (nanobind::hasattr(obj, "_to_spinop")) + op = nanobind::cast(obj.attr("_to_spinop")()); else - op = obj.cast(); + op = nanobind::cast(obj); if (op.num_terms() != 1) throw std::invalid_argument("expecting a spin op with a single term"); return *op.begin(); @@ -46,21 +48,23 @@ namespace cudaq { /// @brief Bind the `cudaq::observe_result` and `cudaq::async_observe_result` /// data classes to python as `cudaq.ObserveResult` and /// `cudaq.AsyncObserveResult`. -void bindObserveResult(py::module &mod) { - py::class_( +void bindObserveResult(nanobind::module_ &mod) { + nanobind::class_( mod, "ObserveResult", "A data-type containing the results of a call to :func:`observe`. 
" "This includes any measurement counts data, as well as the global " "expectation value of the user-defined `spin_operator`.\n") - .def(py::init()) - .def(py::init( - [](double exp_val, const spin_op &spin_op, sample_result result) { - return observe_result(exp_val, spin_op, result); - })) - .def(py::init( - [](double exp_val, py::object spin_op, sample_result result) { - return observe_result(exp_val, to_spin_op(spin_op), result); - })) + .def(nanobind::init()) + .def("__init__", + [](observe_result *self, double exp_val, const spin_op &spin_op, + sample_result result) { + new (self) observe_result(exp_val, spin_op, result); + }) + .def("__init__", + [](observe_result *self, double exp_val, nanobind::object spin_op, + sample_result result) { + new (self) observe_result(exp_val, to_spin_op(spin_op), result); + }) /// @brief Bind the member functions of `cudaq.ObserveResult`. .def("dump", &observe_result::dump, "Dump the raw data from the :class:`SampleResult` that are stored " @@ -79,18 +83,18 @@ void bindObserveResult(py::module &mod) { [](observe_result &self, const spin_op_term &sub_term) { return self.counts(sub_term); }, - py::arg("sub_term"), "") + nanobind::arg("sub_term"), "") .def( "counts", - [](observe_result &self, py::object sub_term) { + [](observe_result &self, nanobind::object sub_term) { return self.counts(to_spin_op_term(sub_term)); }, - py::arg("sub_term"), - R"#(Given a `sub_term` of the global `spin_operator` that was passed + nanobind::arg("sub_term"), + R"#(Given a `sub_term` of the global `spin_operator` that was passed to :func:`observe`, return its measurement counts. Args: - sub_term (`SpinOperator`): An individual sub-term of the + sub_term (`SpinOperator`): An individual sub-term of the `spin_operator`. Returns: @@ -104,7 +108,7 @@ to :func:`observe`, return its measurement counts. 
1); return self.counts(sub_term); }, - py::arg("sub_term"), + nanobind::arg("sub_term"), "Deprecated - ensure to pass a SpinOperatorTerm instead of a " "SpinOperator") .def( @@ -117,22 +121,22 @@ to :func:`observe`, return its measurement counts. [](observe_result &self, const spin_op_term &spin_term) { return self.expectation(spin_term); }, - py::arg("sub_term"), "") + nanobind::arg("sub_term"), "") .def( "expectation", - [](observe_result &self, py::object spin_term) { + [](observe_result &self, nanobind::object spin_term) { return self.expectation(to_spin_op_term(spin_term)); }, - py::arg("sub_term"), - R"#(Return the expectation value of an individual `sub_term` of the + nanobind::arg("sub_term"), + R"#(Return the expectation value of an individual `sub_term` of the global `spin_operator` that was passed to :func:`observe`. Args: - sub_term (:class:`SpinOperatorTerm`): An individual sub-term of the + sub_term (:class:`SpinOperatorTerm`): An individual sub-term of the `spin_operator`. Returns: - float : The expectation value of the `sub_term` with respect to the + float : The expectation value of the `sub_term` with respect to the :class:`Kernel` that was passed to :func:`observe`.)#") .def( "expectation", @@ -144,36 +148,37 @@ global `spin_operator` that was passed to :func:`observe`. return self.expectation(spin_term); }, - py::arg("sub_term"), + nanobind::arg("sub_term"), "Deprecated - ensure to pass a SpinOperatorTerm instead of a " "SpinOperator"); - py::class_( + nanobind::class_( mod, "AsyncObserveResult", - R"#(A data-type containing the results of a call to :func:`observe_async`. - -The `AsyncObserveResult` contains a future, whose :class:`ObserveResult` -may be returned via an invocation of the `get` method. + R"#(A data-type containing the results of a call to :func:`observe_async`. + +The `AsyncObserveResult` contains a future, whose :class:`ObserveResult` +may be returned via an invocation of the `get` method. 
This kicks off a wait on the current thread until the results are available. See `future `_ for more information on this programming pattern.)#") - .def(py::init([](std::string inJson, spin_op op) { - async_observe_result f(&op); - std::istringstream is(inJson); - is >> f; - return f; - })) - .def(py::init([](std::string inJson, py::object op) { - auto as_spin_op = to_spin_op(op); - async_observe_result f(&as_spin_op); - std::istringstream is(inJson); - is >> f; - return f; - })) + .def("__init__", + [](async_observe_result *self, std::string inJson, spin_op op) { + new (self) async_observe_result(&op); + std::istringstream is(inJson); + is >> *self; + }) + .def("__init__", + [](async_observe_result *self, std::string inJson, + nanobind::object op) { + auto as_spin_op = to_spin_op(op); + new (self) async_observe_result(&as_spin_op); + std::istringstream is(inJson); + is >> *self; + }) .def("get", &async_observe_result::get, - py::call_guard(), + nanobind::call_guard(), "Returns the :class:`ObserveResult` from the asynchronous observe " "execution.") .def("__str__", [](async_observe_result &self) { diff --git a/python/runtime/common/py_ObserveResult.h b/python/runtime/common/py_ObserveResult.h index 920a09c78e1..823d0b0ee6a 100644 --- a/python/runtime/common/py_ObserveResult.h +++ b/python/runtime/common/py_ObserveResult.h @@ -6,11 +6,9 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Binds `cudaq.ObserveResult` and `cudaq.AsyncObserveResult`. 
-void bindObserveResult(py::module &mod); +void bindObserveResult(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_Resources.cpp b/python/runtime/common/py_Resources.cpp index c777b185aaa..07098a83377 100644 --- a/python/runtime/common/py_Resources.cpp +++ b/python/runtime/common/py_Resources.cpp @@ -6,8 +6,11 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include -#include +#include +#include +#include +#include +#include #include "py_Resources.h" @@ -17,14 +20,14 @@ namespace cudaq { -void bindResources(py::module &mod) { +void bindResources(nanobind::module_ &mod) { using namespace cudaq; - py::class_( + nanobind::class_( mod, "Resources", - R"#(A data-type containing the results of a call to :func:`estimate_resources`. + R"#(A data-type containing the results of a call to :func:`estimate_resources`. This includes all gate counts.)#") - .def(py::init<>()) + .def(nanobind::init<>()) .def( "dump", [](Resources &self) { self.dump(); }, "Print a string of the raw resource counts data to the " @@ -59,41 +62,33 @@ This includes all gate counts.)#") "to_dict", [](Resources &self) { return self.gateCounts(); }, "Return a dictionary of the raw resource counts that are stored in " "`self`.\n") - .def_property_readonly( - "num_qubits", &Resources::getNumQubits, - "The total number of qubits allocated in the kernel.\n") - .def_property_readonly( - "num_used_qubits", &Resources::getNumUsedQubits, - "The number of qubits touched by at least one quantum " - "operation.\n") - .def_property_readonly( - "depth", &Resources::getCircuitDepth, - "The circuit depth (longest gate chain on any qubit).\n") - .def_property_readonly( + .def_prop_ro("num_qubits", &Resources::getNumQubits, + "The total number of qubits allocated in the kernel.\n") + .def_prop_ro("num_used_qubits", &Resources::getNumUsedQubits, + "The number of qubits touched 
by at least one quantum " + "operation.\n") + .def_prop_ro("depth", &Resources::getCircuitDepth, + "The circuit depth (longest gate chain on any qubit).\n") + .def_prop_ro( "gate_count_by_arity", - [](Resources &self) { - return py::dict(py::cast(self.getGateCountsByArity())); - }, + [](Resources &self) { return self.getGateCountsByArity(); }, "Gate counts by qubit arity, as a dict mapping arity to count.\n") .def("gate_count_for_arity", &Resources::getGateCountByArity, - py::arg("arity"), + nanobind::arg("arity"), "Get gate count for a specific qubit arity (total qubits " "including controls and targets). Returns 0 if no gates of " "that arity exist.") - .def("depth_for_arity", &Resources::getDepthByArity, py::arg("arity"), + .def("depth_for_arity", &Resources::getDepthByArity, + nanobind::arg("arity"), "Get circuit depth considering only gates of a specific qubit " "arity. Returns 0 if no gates of that arity exist.") - .def_property_readonly("multi_qubit_gate_count", - &Resources::getMultiQubitGateCount, - "Total count of gates with 2 or more qubits.\n") - .def_property_readonly("multi_qubit_depth", - &Resources::getMultiQubitDepth, - "Max depth across all gate widths >= 2.\n") - .def_property_readonly( + .def_prop_ro("multi_qubit_gate_count", &Resources::getMultiQubitGateCount, + "Total count of gates with 2 or more qubits.\n") + .def_prop_ro("multi_qubit_depth", &Resources::getMultiQubitDepth, + "Max depth across all gate widths >= 2.\n") + .def_prop_ro( "per_qubit_depth", - [](Resources &self) { - return py::dict(py::cast(self.getPerQubitDepth())); - }, + [](Resources &self) { return self.getPerQubitDepth(); }, "Per-qubit circuit depth (all gates), as a dict mapping qubit " "index to depth.\n") .def("clear", &Resources::clear, "Clear out all metadata from `self`.\n"); diff --git a/python/runtime/common/py_Resources.h b/python/runtime/common/py_Resources.h index cf5f6e7fdaf..4ea7546e1a3 100644 --- a/python/runtime/common/py_Resources.h +++ 
b/python/runtime/common/py_Resources.h @@ -5,11 +5,9 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind `cudaq.Resources` to python. -void bindResources(py::module &mod); +void bindResources(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/common/py_SampleResult.cpp b/python/runtime/common/py_SampleResult.cpp index 6196502bb63..47b65d5226e 100644 --- a/python/runtime/common/py_SampleResult.cpp +++ b/python/runtime/common/py_SampleResult.cpp @@ -6,8 +6,11 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include -#include +#include +#include +#include +#include +#include #include "py_SampleResult.h" @@ -17,26 +20,22 @@ namespace cudaq { -void bindMeasureCounts(py::module &mod) { +void bindMeasureCounts(nanobind::module_ &mod) { using namespace cudaq; // TODO Bind the variants of this functions that take the register name // as input. - py::class_( + nanobind::class_( mod, "SampleResult", - R"#(A data-type containing the results of a call to :func:`sample`. -This includes all measurement counts data from both mid-circuit and + R"#(A data-type containing the results of a call to :func:`sample`. +This includes all measurement counts data from both mid-circuit and terminal measurements. Note: - Conditional logic on mid-circuit measurements is no longer supported with - `sample`. Use `run` instead. 
- -Attributes: - register_names (List[str]): A list of the names of each measurement - register that are stored in `self`.)#") - .def_property_readonly("register_names", &sample_result::register_names) - .def(py::init<>()) + Conditional logic on mid-circuit measurements is no longer supported with + `sample`. Use `run` instead.)#") + .def_prop_ro("register_names", &sample_result::register_names) + .def(nanobind::init<>()) .def( "dump", [](sample_result &self) { self.dump(); }, "Print a string of the raw measurement counts data to the " @@ -63,19 +62,19 @@ terminal measurements. auto map = self.to_map(); auto iter = map.find(bitstring); if (iter == map.end()) - throw py::key_error("bitstring '" + bitstring + - "' does not exist"); + throw nanobind::key_error( + ("bitstring '" + bitstring + "' does not exist").c_str()); return iter->second; }, - py::arg("bitstring"), + nanobind::arg("bitstring"), R"#(Return the measurement counts for the given `bitstring`. Args: bitstring (str): The binary string to return the measurement data of. Returns: - float: The number of times the given `bitstring` was measured + float: The number of times the given `bitstring` was measured during the `shots_count` number of executions on the QPU.)#") .def( "__len__", [](sample_result &self) { return self.to_map().size(); }, @@ -84,12 +83,14 @@ terminal measurements. .def( "__iter__", [](sample_result &self) { - return py::make_key_iterator(self.begin(), self.end()); + return nanobind::make_key_iterator(nanobind::type(), + "key_iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), + nanobind::keep_alive<0, 1>(), "Iterate through the :class:`SampleResult` dictionary.\n") .def("expectation", &sample_result::expectation, - py::arg("register_name") = GlobalRegisterName, + nanobind::arg("register_name") = GlobalRegisterName, "Return the expectation value in the Z-basis of the :class:`Kernel` " "that was sampled.\n") .def( @@ -102,45 +103,46 @@ terminal measurements. 
1); return self.expectation(); }, - py::arg("register_name") = GlobalRegisterName, + nanobind::arg("register_name") = GlobalRegisterName, "Return the expectation value in the Z-basis of the :class:`Kernel` " "that was sampled.\n") .def("probability", &sample_result::probability, "Return the probability of observing the given bit string.\n", - py::arg("bitstring"), py::arg("register_name") = GlobalRegisterName, + nanobind::arg("bitstring"), + nanobind::arg("register_name") = GlobalRegisterName, R"#(Return the probability of measuring the given `bitstring`. Args: - bitstring (str): The binary string to return the measurement + bitstring (str): The binary string to return the measurement probability of. - register_name (Optional[str]): The optional measurement register - name to extract the probability from. Defaults to the '__global__' + register_name (Optional[str]): The optional measurement register + name to extract the probability from. Defaults to the '__global__' register. Returns: - float: - The probability of measuring the given `bitstring`. Equivalent - to the proportion of the total times the bitstring was measured + float: + The probability of measuring the given `bitstring`. Equivalent + to the proportion of the total times the bitstring was measured vs. the number of experiments (`shots_count`).)#") .def("most_probable", &sample_result::most_probable, - py::arg("register_name") = GlobalRegisterName, - R"#(Return the bitstring that was measured most frequently in the + nanobind::arg("register_name") = GlobalRegisterName, + R"#(Return the bitstring that was measured most frequently in the experiment. Args: - register_name (Optional[str]): The optional measurement register - name to extract the most probable bitstring from. Defaults to the + register_name (Optional[str]): The optional measurement register + name to extract the most probable bitstring from. Defaults to the '__global__' register. 
Returns: str: The most frequently measured binary string during the experiment.)#") - .def("count", &sample_result::count, py::arg("bitstring"), - py::arg("register_name") = GlobalRegisterName, + .def("count", &sample_result::count, nanobind::arg("bitstring"), + nanobind::arg("register_name") = GlobalRegisterName, R"#(Return the number of times the given bitstring was observed. Args: bitstring (str): The binary string to return the measurement counts for. - register_name (Optional[str]): The optional measurement register name to + register_name (Optional[str]): The optional measurement register name to extract the probability from. Defaults to the '__global__' register. Returns: @@ -149,21 +151,21 @@ experiment. static_cast &, const std::string_view) const>( &sample_result::get_marginal), - py::arg("marginal_indices"), py::kw_only(), - py::arg("register_name") = GlobalRegisterName, - R"#(Extract the measurement counts data for the provided subset of + nanobind::arg("marginal_indices"), nanobind::kw_only(), + nanobind::arg("register_name") = GlobalRegisterName, + R"#(Extract the measurement counts data for the provided subset of qubits (`marginal_indices`). Args: - marginal_indices (list[int]): A list of the qubit indices to extract the + marginal_indices (list[int]): A list of the qubit indices to extract the measurement data from. - register_name (Optional[str]): The optional measurement register name to extract + register_name (Optional[str]): The optional measurement register name to extract the counts data from. Defaults to the '__global__' register. Returns: - :class:`SampleResult`: + :class:`SampleResult`: A new `SampleResult` dictionary containing the extracted measurement data.)#") .def("get_sequential_data", &sample_result::sequential_data, - py::arg("register_name") = GlobalRegisterName, + nanobind::arg("register_name") = GlobalRegisterName, "Return the data from the given register (`register_name`) as it " "was collected sequentially. 
A list of measurement results, not " "collated into a map.\n") @@ -174,26 +176,30 @@ qubits (`marginal_indices`). ExecutionResult res(cd); return sample_result(res); }, - py::arg("register_name"), + nanobind::arg("register_name"), "Extract the provided sub-register (`register_name`) as a new " ":class:`SampleResult`.\n") .def( "items", [](sample_result &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "item_iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), + nanobind::keep_alive<0, 1>(), "Return the key/value pairs in this :class:`SampleResult` " "dictionary.\n") .def( "values", [](sample_result &self) { - return py::make_value_iterator(self.begin(), self.end()); + return nanobind::make_value_iterator( + nanobind::type(), "value_iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), + nanobind::keep_alive<0, 1>(), "Return all values (the counts) in this :class:`SampleResult` " "dictionary.\n") - .def(py::self += py::self) + .def(nanobind::self += nanobind::self) .def("clear", &sample_result::clear, "Clear out all metadata from `self`.\n"); } diff --git a/python/runtime/common/py_SampleResult.h b/python/runtime/common/py_SampleResult.h index 2cc72487900..832acf3e40c 100644 --- a/python/runtime/common/py_SampleResult.h +++ b/python/runtime/common/py_SampleResult.h @@ -5,13 +5,11 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include +#include #include "utils/LinkedLibraryHolder.h" -namespace py = pybind11; - namespace cudaq { /// @brief Bind `cudaq.MeasureCounts` to python. 
-void bindMeasureCounts(py::module &mod); +void bindMeasureCounts(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_draw.cpp b/python/runtime/cudaq/algorithms/py_draw.cpp index be13796efd9..94d01c1b151 100644 --- a/python/runtime/cudaq/algorithms/py_draw.cpp +++ b/python/runtime/cudaq/algorithms/py_draw.cpp @@ -11,14 +11,12 @@ #include "cudaq/platform/nvqpp_interface.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -namespace py = pybind11; - /// @brief Run `cudaq::contrib::draw`'s string overload on the provided kernel. /// \p kernel is a kernel decorator object and \p args are the arguments to /// launch \p kernel. static std::string pyDraw(const std::string &format, const std::string &shortName, MlirModule mod, - py::args runtimeArgs) { + nanobind::args runtimeArgs) { if (format != "ascii" && format != "latex") throw std::runtime_error("format argument must be \"ascii\" or \"latex\"."); @@ -31,11 +29,11 @@ static std::string pyDraw(const std::string &format, } /// @brief Bind the draw cudaq function -void cudaq::bindPyDraw(py::module &mod) { +void cudaq::bindPyDraw(nanobind::module_ &mod) { mod.def( "draw_impl", [](const std::string &format, const std::string &shortName, - MlirModule mod, py::args runtimeArgs) { + MlirModule mod, nanobind::args runtimeArgs) { return pyDraw(format, shortName, mod, runtimeArgs); }, R"#( @@ -47,7 +45,7 @@ string. Args: format (str): The format of the output. Can be 'ascii' or 'latex'. kernel (:class:`Kernel`): The :class:`Kernel` to draw. - *arguments (Optional[Any]): The concrete values to evaluate the kernel + *arguments (Optional[Any]): The concrete values to evaluate the kernel function at. Leave empty if the kernel doesn't accept any arguments. Returns: @@ -66,12 +64,12 @@ string. 
mz(q) print(cudaq.draw(bell_pair)) # Output - # ╭───╮ + # ╭───╮ # q0 : ┤ h ├──●── # ╰───╯╭─┴─╮ # q1 : ─────┤ x ├ # ╰───╯ - + # Example with arguments import cudaq @cudaq.kernel diff --git a/python/runtime/cudaq/algorithms/py_draw.h b/python/runtime/cudaq/algorithms/py_draw.h index f6bd76d4a3b..cc5c37df87e 100644 --- a/python/runtime/cudaq/algorithms/py_draw.h +++ b/python/runtime/cudaq/algorithms/py_draw.h @@ -9,8 +9,8 @@ #pragma once #include "utils/OpaqueArguments.h" -#include +#include namespace cudaq { -void bindPyDraw(pybind11::module &mod); +void bindPyDraw(nanobind::module_ &mod); } diff --git a/python/runtime/cudaq/algorithms/py_evolve.cpp b/python/runtime/cudaq/algorithms/py_evolve.cpp index b243287a0db..80e54f3edc7 100644 --- a/python/runtime/cudaq/algorithms/py_evolve.cpp +++ b/python/runtime/cudaq/algorithms/py_evolve.cpp @@ -11,13 +11,17 @@ #include "cudaq/algorithms/evolve_internal.h" #include "cudaq/runtime/logger/logger.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/IR.h" -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include namespace cudaq { @@ -26,17 +30,18 @@ using spin_op_creator = std::function)>; // Helper to determine if an object is a Python kernel builder object (PyKernel) -static bool isPyKernelObject(py::object &kernel) { +static bool isPyKernelObject(nanobind::object &kernel) { const std::string kernelTypeName = - py::hasattr(kernel, "__class__") - ? kernel.attr("__class__").attr("__name__").cast() + nanobind::hasattr(kernel, "__class__") + ? 
nanobind::cast( + kernel.attr("__class__").attr("__name__")) : ""; return (kernelTypeName == "PyKernel"); } template evolve_result -pyEvolve(state initial_state, py::object kernel, +pyEvolve(state initial_state, nanobind::object kernel, std::map params, std::vector> observables = {}, int shots_count = -1) { @@ -44,11 +49,11 @@ pyEvolve(state initial_state, py::object kernel, throw std::runtime_error( "The provided kernel to pyEvolve is not a valid PyKernel object."); - if (py::hasattr(kernel, "compile")) + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); - auto kernelName = kernel.attr("name").cast(); - auto kernelMod = unwrap(kernel.attr("module").cast()); + auto kernelName = nanobind::cast(kernel.attr("name")); + auto kernelMod = unwrap(nanobind::cast(kernel.attr("module"))); std::vector spin_ops = {}; for (auto &observable : observables) { @@ -70,23 +75,24 @@ pyEvolve(state initial_state, py::object kernel, template evolve_result -pyEvolve(state initial_state, std::vector kernels, +pyEvolve(state initial_state, std::vector kernels, std::vector> params, std::vector> observables = {}, int shots_count = -1, bool save_intermediate_states = true) { - if (!std::all_of(kernels.begin(), kernels.end(), - [](py::object &kernel) { return isPyKernelObject(kernel); })) + if (!std::all_of( + kernels.begin(), kernels.end(), + [](nanobind::object &kernel) { return isPyKernelObject(kernel); })) throw std::runtime_error( "One or more of the provided kernels to pyEvolve is not a valid " "PyKernel object."); std::vector> launchFcts = {}; - for (py::object kernel : kernels) { - if (py::hasattr(kernel, "compile")) + for (nanobind::object kernel : kernels) { + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); - auto kernelName = kernel.attr("name").cast(); - auto kernelMod = unwrap(kernel.attr("module").cast()); + auto kernelName = nanobind::cast(kernel.attr("name")); + auto kernelMod = unwrap(nanobind::cast(kernel.attr("module"))); 
launchFcts.push_back([kernelMod, kernelName](state state) mutable { auto *argData = new cudaq::OpaqueArguments(); @@ -112,7 +118,7 @@ pyEvolve(state initial_state, std::vector kernels, template async_evolve_result -pyEvolveAsync(state initial_state, py::object kernel, +pyEvolveAsync(state initial_state, nanobind::object kernel, std::map params, std::vector> observables = {}, std::size_t qpu_id = 0, @@ -122,18 +128,19 @@ pyEvolveAsync(state initial_state, py::object kernel, throw std::runtime_error( "The provided kernel to pyEvolveAsync is not a valid PyKernel object."); - if (py::hasattr(kernel, "compile")) + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); - auto kernelMod = unwrap(kernel.attr("module").cast()).clone(); - auto kernelName = kernel.attr("name").cast(); + auto kernelMod = + unwrap(nanobind::cast(kernel.attr("module"))).clone(); + auto kernelName = nanobind::cast(kernel.attr("name")); std::vector spin_ops = {}; for (auto observable : observables) { spin_ops.push_back(observable(params)); } - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return __internal__::evolve_async( initial_state, [kernelMod, kernelName](state state) mutable { @@ -148,27 +155,29 @@ pyEvolveAsync(state initial_state, py::object kernel, template async_evolve_result -pyEvolveAsync(state initial_state, std::vector kernels, +pyEvolveAsync(state initial_state, std::vector kernels, std::vector> params, std::vector> observables = {}, std::size_t qpu_id = 0, std::optional noise_model = std::nullopt, int shots_count = -1, bool save_intermediate_states = true) { - if (!std::all_of(kernels.begin(), kernels.end(), - [](py::object &kernel) { return isPyKernelObject(kernel); })) + if (!std::all_of( + kernels.begin(), kernels.end(), + [](nanobind::object &kernel) { return isPyKernelObject(kernel); })) throw std::runtime_error( "One or more of the provided kernels to pyEvolveAsync is not a valid " "PyKernel object."); std::vector> launchFcts = {}; - 
for (py::object kernel : kernels) { - if (py::hasattr(kernel, "compile")) + for (nanobind::object kernel : kernels) { + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); // IMPORTANT: we need to make sure no Python data is accessed in the async. // functor. - auto kernelMod = unwrap(kernel.attr("module").cast()).clone(); - auto kernelName = kernel.attr("name").cast(); + auto kernelMod = + unwrap(nanobind::cast(kernel.attr("module"))).clone(); + auto kernelName = nanobind::cast(kernel.attr("name")); launchFcts.push_back( [kernelMod = std::move(kernelMod), kernelName](state state) mutable { cudaq::OpaqueArguments argData; @@ -187,7 +196,7 @@ pyEvolveAsync(state initial_state, std::vector kernels, spin_ops.push_back(std::move(ops)); } - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return __internal__::evolve_async(initial_state, launchFcts, spin_ops, qpu_id, noise_model, shots_count, save_intermediate_states); @@ -196,7 +205,7 @@ pyEvolveAsync(state initial_state, std::vector kernels, #define DEFINE_PARAM_TYPE_OVERLOAD_VEC(type, pyMod) \ pyMod.def( \ "evolve", \ - [](state initial_state, std::vector kernels, \ + [](state initial_state, std::vector kernels, \ std::vector> params = {}, \ std::vector> observables = {}, \ int shots_count = -1, bool save_intermediate_states = true) { \ @@ -205,16 +214,16 @@ pyEvolveAsync(state initial_state, std::vector kernels, }, \ "Evolve the given initial_state with the provided kernel and " \ "parameters.", \ - py::arg("initial_state"), py::arg("kernels"), \ - py::arg("params") = std::vector>{}, \ - py::arg("observables") = std::vector>{}, \ - py::arg("shots_count") = -1, \ - py::arg("save_intermediate_states") = true); + nanobind::arg("initial_state"), nanobind::arg("kernels"), \ + nanobind::arg("params") = std::vector>{}, \ + nanobind::arg("observables") = std::vector>{}, \ + nanobind::arg("shots_count") = -1, \ + nanobind::arg("save_intermediate_states") = true); #define 
DEFINE_PARAM_TYPE_OVERLOAD(type, pyMod) \ pyMod.def( \ "evolve", \ - [](state initial_state, py::object kernel, \ + [](state initial_state, nanobind::object kernel, \ std::map params = {}, \ std::vector> observables = {}, \ int shots_count = -1) { \ @@ -223,15 +232,15 @@ pyEvolveAsync(state initial_state, std::vector kernels, }, \ "Evolve the given initial_state with the provided kernel and " \ "parameters.", \ - py::arg("initial_state"), py::arg("kernels"), \ - py::arg("params") = std::map{}, \ - py::arg("observables") = std::vector>{}, \ - py::arg("shots_count") = -1); + nanobind::arg("initial_state"), nanobind::arg("kernels"), \ + nanobind::arg("params") = std::map{}, \ + nanobind::arg("observables") = std::vector>{}, \ + nanobind::arg("shots_count") = -1); #define DEFINE_ASYNC_PARAM_TYPE_OVERLOAD_VEC(type, pyMod) \ pyMod.def( \ "evolve_async", \ - [](state initial_state, std::vector kernels, \ + [](state initial_state, std::vector kernels, \ std::vector> params = {}, \ std::vector> observables = {}, \ std::size_t qpu_id = 0, \ @@ -243,17 +252,18 @@ pyEvolveAsync(state initial_state, std::vector kernels, }, \ "Asynchronously evolve the given initial_state with " \ "the provided kernel and parameters.", \ - py::arg("initial_state"), py::arg("kernels"), \ - py::arg("params") = std::vector>{}, \ - py::arg("observables") = std::vector>{}, \ - py::arg("qpu_id") = 0, py::arg("noise_model") = std::nullopt, \ - py::arg("shots_count") = -1, \ - py::arg("save_intermediate_states") = true); + nanobind::arg("initial_state"), nanobind::arg("kernels"), \ + nanobind::arg("params") = std::vector>{}, \ + nanobind::arg("observables") = std::vector>{}, \ + nanobind::arg("qpu_id") = 0, \ + nanobind::arg("noise_model") = std::nullopt, \ + nanobind::arg("shots_count") = -1, \ + nanobind::arg("save_intermediate_states") = true); #define DEFINE_ASYNC_PARAM_TYPE_OVERLOAD(type, pyMod) \ pyMod.def( \ "evolve_async", \ - [](state initial_state, py::object kernel, \ + [](state 
initial_state, nanobind::object kernel, \ std::map params = {}, \ std::vector> observables = {}, \ std::size_t qpu_id = 0, \ @@ -264,14 +274,15 @@ pyEvolveAsync(state initial_state, std::vector kernels, }, \ "Asynchronously evolve the given initial_state with " \ "the provided kernel and parameters.", \ - py::arg("initial_state"), py::arg("kernels"), \ - py::arg("params") = std::map{}, \ - py::arg("observables") = std::vector>{}, \ - py::arg("qpu_id") = 0, py::arg("noise_model") = std::nullopt, \ - py::arg("shots_count") = -1); + nanobind::arg("initial_state"), nanobind::arg("kernels"), \ + nanobind::arg("params") = std::map{}, \ + nanobind::arg("observables") = std::vector>{}, \ + nanobind::arg("qpu_id") = 0, \ + nanobind::arg("noise_model") = std::nullopt, \ + nanobind::arg("shots_count") = -1); /// @brief Bind the evolve cudaq function for circuit simulator -void bindPyEvolve(py::module &mod) { +void bindPyEvolve(nanobind::module_ &mod) { // Sync evolve overloads DEFINE_PARAM_TYPE_OVERLOAD_VEC(long, mod); DEFINE_PARAM_TYPE_OVERLOAD_VEC(double, mod); diff --git a/python/runtime/cudaq/algorithms/py_evolve.h b/python/runtime/cudaq/algorithms/py_evolve.h index 869806c41b4..4af37da5b0c 100644 --- a/python/runtime/cudaq/algorithms/py_evolve.h +++ b/python/runtime/cudaq/algorithms/py_evolve.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindPyEvolve(py::module &mod); +void bindPyEvolve(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_observe_async.cpp b/python/runtime/cudaq/algorithms/py_observe_async.cpp index 58c5ce37454..19586bce198 100644 --- a/python/runtime/cudaq/algorithms/py_observe_async.cpp +++ b/python/runtime/cudaq/algorithms/py_observe_async.cpp @@ -13,14 +13,15 @@ #include "cudaq/Todo.h" #include "cudaq/algorithms/observe.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" 
-#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include -#include - -namespace py = pybind11; +#include +#include +#include +#include using namespace cudaq; @@ -67,14 +68,14 @@ static async_observe_result pyObserveAsync(const std::string &shortName, mlir::ModuleOp mod, const spin_op &spin_operator, std::size_t qpu_id, int shots, - py::args args) { + nanobind::args args) { auto &platform = get_platform(); args = simplifiedValidateInputArguments(args); auto fnOp = getKernelFuncOp(mod, shortName); auto opaques = marshal_arguments_for_module_launch(mod, args, fnOp); // Launch the asynchronous execution. - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return details::runObservationAsync( detail::make_copyable_function([opaques = std::move(opaques), shortName, mod = mod.clone()]() mutable { @@ -86,17 +87,16 @@ static async_observe_result pyObserveAsync(const std::string &shortName, spin_operator, platform, shots, shortName, qpu_id); } -static async_observe_result observe_async_impl(const std::string &shortName, - MlirModule module, - py::object &spin_operator_obj, - std::size_t qpu_id, int shots, - py::args args) { +static async_observe_result +observe_async_impl(const std::string &shortName, MlirModule module, + nanobind::object &spin_operator_obj, std::size_t qpu_id, + int shots, nanobind::args args) { // FIXME(OperatorCpp): Remove this when the operator class is implemented in // C++ - spin_op spin_operator = [](py::object &obj) -> spin_op { - if (py::hasattr(obj, "_to_spinop")) - return obj.attr("_to_spinop")().cast(); - return obj.cast(); + spin_op spin_operator = [](nanobind::object &obj) -> spin_op { + if (nanobind::hasattr(obj, "_to_spinop")) + return nanobind::cast(obj.attr("_to_spinop")()); + return nanobind::cast(obj); }(spin_operator_obj); auto mod = unwrap(module); return pyObserveAsync(shortName, mod, spin_operator, qpu_id, shots, args); @@ -106,7 +106,7 @@ 
static async_observe_result observe_async_impl(const std::string &shortName, static observe_result pyObservePar(const PyParType &type, const std::string &shortName, mlir::ModuleOp module, spin_op &spin_operator, int shots, - std::optional noise, py::args args) { + std::optional noise, nanobind::args args) { // Ensure the user input is correct. auto &platform = get_platform(); if (!platform.supports_task_distribution()) @@ -163,11 +163,14 @@ pyObservePar(const PyParType &type, const std::string &shortName, /// Observe can be a single observe call, a parallel observe call, or a observe /// broadcast. All these variants are handled here. -static observe_result -observe_parallel_impl(const std::string &shortName, MlirModule module, - py::type execution, spin_op &spin_operator, int shots, - std::optional noise, py::args arguments) { - std::string applicatorKey = py::str(execution.attr("__name__")); +static observe_result observe_parallel_impl(const std::string &shortName, + MlirModule module, + nanobind::type_object execution, + spin_op &spin_operator, int shots, + std::optional noise, + nanobind::args arguments) { + std::string applicatorKey = + nanobind::cast(execution.attr("__name__")); auto mod = unwrap(module); if (applicatorKey == "thread") return pyObservePar(PyParType::thread, shortName, mod, spin_operator, shots, @@ -178,14 +181,14 @@ observe_parallel_impl(const std::string &shortName, MlirModule module, throw std::runtime_error("invalid parallel execution context"); } -void cudaq::bindObserveAsync(py::module &mod) { +void cudaq::bindObserveAsync(nanobind::module_ &mod) { auto parallelSubmodule = mod.def_submodule("parallel"); - py::class_( + nanobind::class_( parallelSubmodule, "mpi", "Type indicating that the :func:`observe` function should distribute its " "expectation value computations across available MPI ranks and GPUs for " "each term."); - py::class_( + nanobind::class_( parallelSubmodule, "thread", "Type indicating that the :func:`observe` function 
should distribute its " "term " diff --git a/python/runtime/cudaq/algorithms/py_observe_async.h b/python/runtime/cudaq/algorithms/py_observe_async.h index 44cb0a63048..ebd599b6ab3 100644 --- a/python/runtime/cudaq/algorithms/py_observe_async.h +++ b/python/runtime/cudaq/algorithms/py_observe_async.h @@ -8,8 +8,8 @@ #pragma once -#include +#include namespace cudaq { -void bindObserveAsync(pybind11::module &mod); +void bindObserveAsync(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_optimizer.cpp b/python/runtime/cudaq/algorithms/py_optimizer.cpp index 79064fbf867..339b33e81ae 100644 --- a/python/runtime/cudaq/algorithms/py_optimizer.cpp +++ b/python/runtime/cudaq/algorithms/py_optimizer.cpp @@ -5,8 +5,12 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include -#include +#include +#include +#include +#include +#include +#include #include "common/JsonConvert.h" #include "cudaq/algorithms/gradients/central_difference.h" @@ -19,23 +23,26 @@ namespace cudaq { -/// @brief Bind the `cudaq::optimization_result` typedef. -void bindOptimizationResult(py::module &mod) { - py::class_(mod, "OptimizationResult"); +/// @brief optimization_result is a typedef for std::tuple> which is automatically converted by nanobind's +/// stl/tuple type caster. +void bindOptimizationResult(nanobind::module_ &mod) { + mod.attr("OptimizationResult") = + nanobind::handle(reinterpret_cast(&PyTuple_Type)); } -void bindGradientStrategies(py::module &mod) { +void bindGradientStrategies(nanobind::module_ &mod) { // Binding under the `cudaq.gradients` namespace in python. auto gradients_submodule = mod.def_submodule("gradients"); // Have to bind the parent class, `cudaq::gradient`, to allow // for the passing of arbitrary `cudaq::gradients::` around. 
// Note: this class lives under `cudaq.gradients.gradient` // in python. - py::class_(gradients_submodule, "gradient"); + nanobind::class_(gradients_submodule, "gradient"); // Gradient strategies derive from the `cudaq::gradient` class. - py::class_(gradients_submodule, - "CentralDifference") - .def(py::init<>()) + nanobind::class_(gradients_submodule, + "CentralDifference") + .def(nanobind::init<>()) .def( "to_json", [](const gradients::central_difference &p) { return json(p).dump(); }, @@ -51,18 +58,20 @@ void bindGradientStrategies(py::module &mod) { .def( "compute", [](cudaq::gradient &grad, const std::vector &x, - py::function &func, double funcAtX) { + nanobind::callable &func, double funcAtX) { auto function = - func.cast)>>(); + nanobind::cast)>>( + func); return grad.compute(x, function, funcAtX); }, - py::arg("parameter_vector"), py::arg("function"), py::arg("funcAtX"), + nanobind::arg("parameter_vector"), nanobind::arg("function"), + nanobind::arg("funcAtX"), "Compute the gradient of the provided `parameter_vector` with " "respect to " "its loss function, using the `CentralDifference` method.\n"); - py::class_(gradients_submodule, - "ForwardDifference") - .def(py::init<>()) + nanobind::class_(gradients_submodule, + "ForwardDifference") + .def(nanobind::init<>()) .def( "to_json", [](const gradients::forward_difference &p) { return json(p).dump(); }, @@ -78,18 +87,20 @@ void bindGradientStrategies(py::module &mod) { .def( "compute", [](cudaq::gradient &grad, const std::vector &x, - py::function &func, double funcAtX) { + nanobind::callable &func, double funcAtX) { auto function = - func.cast)>>(); + nanobind::cast)>>( + func); return grad.compute(x, function, funcAtX); }, - py::arg("parameter_vector"), py::arg("function"), py::arg("funcAtX"), + nanobind::arg("parameter_vector"), nanobind::arg("function"), + nanobind::arg("funcAtX"), "Compute the gradient of the provided `parameter_vector` with " "respect to " "its loss function, using the 
`ForwardDifference` method.\n"); - py::class_(gradients_submodule, - "ParameterShift") - .def(py::init<>()) + nanobind::class_(gradients_submodule, + "ParameterShift") + .def(nanobind::init<>()) .def( "to_json", [](const gradients::parameter_shift &p) { return json(p).dump(); }, @@ -105,12 +116,14 @@ void bindGradientStrategies(py::module &mod) { .def( "compute", [](cudaq::gradient &grad, const std::vector &x, - py::function &func, double funcAtX) { + nanobind::callable &func, double funcAtX) { auto function = - func.cast)>>(); + nanobind::cast)>>( + func); return grad.compute(x, function, funcAtX); }, - py::arg("parameter_vector"), py::arg("function"), py::arg("funcAtX"), + nanobind::arg("parameter_vector"), nanobind::arg("function"), + nanobind::arg("funcAtX"), "Compute the gradient of the provided `parameter_vector` with " "respect to " "its loss function, using the `ParameterShift` method.\n"); @@ -121,9 +134,10 @@ void bindGradientStrategies(py::module &mod) { /// Can now define its member functions on /// that submodule. template -py::class_ addPyOptimizer(py::module &mod, std::string &&name) { - return py::class_(mod, name.c_str()) - .def(py::init<>()) +nanobind::class_ addPyOptimizer(nanobind::module_ &mod, + std::string &&name) { + return nanobind::class_(mod, name.c_str()) + .def(nanobind::init<>()) .def( "to_json", [](const OptimizerT &p) { return json(p).dump(); }, "Convert optimizer to JSON string") @@ -135,15 +149,15 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { return p; }, "Convert JSON string to optimizer") - .def_readwrite("max_iterations", &OptimizerT::max_eval, R"doc( + .def_rw("max_iterations", &OptimizerT::max_eval, R"doc( int: Maximum number of optimizer iterations (default: unlimited). Sets an upper bound on the number of function evaluations or iterations the optimizer will perform. If not set, the optimizer may run until convergence or until another stopping criterion is met. 
)doc") - .def_readwrite("initial_parameters", &OptimizerT::initial_parameters, - R"doc( + .def_rw("initial_parameters", &OptimizerT::initial_parameters, + R"doc( list[float]: Initial values for the optimization parameters (optional). Provides a starting point for the optimization. If not specified, the @@ -156,7 +170,7 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { optimizer.initial_parameters = [0.5, -0.3, 1.2] )doc") - .def_readwrite("lower_bounds", &OptimizerT::lower_bounds, R"doc( + .def_rw("lower_bounds", &OptimizerT::lower_bounds, R"doc( list[float]: Lower bounds for optimization parameters (optional). Constrains the search space by specifying minimum allowed values for @@ -168,7 +182,7 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { optimizer.lower_bounds = [-2.0, -2.0] # For 2D problem )doc") - .def_readwrite("upper_bounds", &OptimizerT::upper_bounds, R"doc( + .def_rw("upper_bounds", &OptimizerT::upper_bounds, R"doc( list[float]: Upper bounds for optimization parameters (optional). Constrains the search space by specifying maximum allowed values for @@ -197,21 +211,22 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { )doc") .def( "optimize", - [](OptimizerT &opt, const int dim, py::function &func) { + [](OptimizerT &opt, const int dim, nanobind::callable &func) { return opt.optimize(dim, [&](std::vector x, std::vector &grad) { // Call the function. auto ret = func(x); // Does it return a tuple? - auto isTupleReturn = py::isinstance(ret); + auto isTupleReturn = nanobind::isinstance(ret); // If we don't need gradients, and it does, just grab the value // and return. if (!opt.requiresGradients() && isTupleReturn) - return ret.cast()[0].cast(); + return nanobind::cast( + nanobind::cast(ret)[0]); // If we don't need gradients and it doesn't return tuple, then // just pass what we got. 
if (!opt.requiresGradients() && !isTupleReturn) - return ret.cast(); + return nanobind::cast(ret); // Throw an error if we need gradients and they weren't provided. if (opt.requiresGradients() && !isTupleReturn) @@ -220,16 +235,16 @@ py::class_ addPyOptimizer(py::module &mod, std::string &&name) { "(float, list[float]) for gradient-based optimizers"); // If here, we require gradients, and the signature is right. - auto tuple = ret.cast(); + auto tuple = nanobind::cast(ret); auto val = tuple[0]; - auto gradIn = tuple[1].cast(); + auto gradIn = nanobind::cast(tuple[1]); for (std::size_t i = 0; i < gradIn.size(); i++) - grad[i] = gradIn[i].cast(); + grad[i] = nanobind::cast(gradIn[i]); - return val.cast(); + return nanobind::cast(val); }); }, - py::arg("dimensions"), py::arg("function"), R"doc( + nanobind::arg("dimensions"), nanobind::arg("function"), R"doc( Run the optimization procedure. Args: @@ -267,14 +282,14 @@ Run the optimization procedure. )doc"); } -void bindOptimizers(py::module &mod) { +void bindOptimizers(nanobind::module_ &mod) { // Binding the `cudaq::optimizers` class to `_pycudaq` as a submodule // so it's accessible directly in the cudaq namespace. auto optimizers_submodule = mod.def_submodule("optimizers"); - py::class_(optimizers_submodule, "optimizer"); + nanobind::class_(optimizers_submodule, "optimizer"); addPyOptimizer(optimizers_submodule, "COBYLA") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Constrained Optimization BY Linear Approximations (COBYLA). COBYLA is a gradient-free derivative-free optimization algorithm that uses @@ -297,7 +312,7 @@ This optimizer does not require gradients from the objective function. )doc"); addPyOptimizer(optimizers_submodule, "NelderMead") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Nelder-Mead simplex optimization algorithm. 
The Nelder-Mead method is a gradient-free simplex-based optimization algorithm @@ -320,7 +335,7 @@ This optimizer does not require gradients from the objective function. )doc"); addPyOptimizer(optimizers_submodule, "LBFGS") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) optimizer. L-BFGS is a quasi-Newton method that approximates the Hessian matrix using @@ -346,7 +361,7 @@ This optimizer requires gradients from the objective function. addPyOptimizer(optimizers_submodule, "GradientDescent") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Basic gradient descent optimization algorithm. Gradient descent iteratively moves in the direction of steepest descent @@ -373,7 +388,7 @@ This optimizer requires gradients from the objective function. // Have to bind extra optimizer parameters to the following manually: auto py_spsa = addPyOptimizer(optimizers_submodule, "SPSA") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Simultaneous Perturbation Stochastic Approximation (SPSA) optimizer. SPSA is a gradient-free optimization algorithm that uses simultaneous @@ -404,15 +419,15 @@ This optimizer does not require gradients from the objective function. function=objective ) )doc"); - py_spsa.def_readwrite("gamma", &cudaq::optimizers::spsa::gamma, R"doc( + py_spsa.def_rw("gamma", &cudaq::optimizers::spsa::gamma, R"doc( float: Scaling exponent for the step size schedule (default: 0.101). Controls how the step size decreases over iterations. The step size at iteration k is proportional to (A + k + 1)^(-gamma), where A is a stability constant. Common values are in the range [0.1, 0.6]. )doc"); - py_spsa.def_readwrite("step_size", &cudaq::optimizers::spsa::eval_step_size, - R"doc( + py_spsa.def_rw("step_size", &cudaq::optimizers::spsa::eval_step_size, + R"doc( float: Evaluation step size for gradient approximation (default: 0.3). 
Controls the magnitude of perturbations used to approximate gradients. @@ -421,7 +436,7 @@ to noise. Typical values range from 0.1 to 0.5. )doc"); auto py_adam = addPyOptimizer(optimizers_submodule, "Adam") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Adaptive Moment Estimation (Adam) optimizer. Adam is an adaptive learning rate optimization algorithm that computes @@ -458,8 +473,8 @@ function must return a tuple of (value, gradient_vector). function=objective_with_grad ) )doc"); - py_adam.def_readwrite("batch_size", &cudaq::optimizers::adam::batch_size, - R"doc( + py_adam.def_rw("batch_size", &cudaq::optimizers::adam::batch_size, + R"doc( int: Number of samples per batch (default: 1). For stochastic optimization, determines how many samples are used to @@ -467,28 +482,28 @@ compute each gradient estimate. Batch size of 1 corresponds to online learning. Larger batch sizes can provide more stable gradient estimates but require more computation per iteration. )doc"); - py_adam.def_readwrite("beta1", &cudaq::optimizers::adam::beta1, R"doc( + py_adam.def_rw("beta1", &cudaq::optimizers::adam::beta1, R"doc( float: Exponential decay rate for the first moment estimates (default: 0.9). Controls the exponential moving average of past gradients (momentum term). Values are typically in the range [0.9, 0.999]. Higher values give more weight to past gradients, providing smoother updates but slower adaptation. )doc"); - py_adam.def_readwrite("beta2", &cudaq::optimizers::adam::beta2, R"doc( + py_adam.def_rw("beta2", &cudaq::optimizers::adam::beta2, R"doc( float: Exponential decay rate for the second moment estimates (default: 0.999). Controls the exponential moving average of past squared gradients. Values are typically in the range [0.99, 0.9999]. Higher values provide more stable learning rates but slower adaptation to changing gradient magnitudes. 
)doc"); - py_adam.def_readwrite("epsilon", &cudaq::optimizers::adam::eps, R"doc( + py_adam.def_rw("epsilon", &cudaq::optimizers::adam::eps, R"doc( float: Small constant for numerical stability (default: 1e-8). Added to the denominator to prevent division by zero when computing adaptive learning rates. Should be a small positive value, typically between 1e-8 and 1e-6. )doc"); - py_adam.def_readwrite("step_size", &cudaq::optimizers::adam::step_size, R"doc( + py_adam.def_rw("step_size", &cudaq::optimizers::adam::step_size, R"doc( float: Learning rate (step size) for parameter updates (default: 0.01). Controls the magnitude of parameter updates at each iteration. Typical @@ -496,7 +511,7 @@ values range from 0.001 to 0.1. The effective learning rate is adapted per parameter based on gradient history. Start with 0.001 or 0.01 and adjust based on convergence behavior. )doc"); - py_adam.def_readwrite("f_tol", &cudaq::optimizers::adam::f_tol, R"doc( + py_adam.def_rw("f_tol", &cudaq::optimizers::adam::f_tol, R"doc( float: Convergence tolerance on the objective function value (default: 1e-4). Optimization terminates when the change in objective function value between @@ -505,7 +520,7 @@ convergence but may require more iterations. )doc"); auto py_sgd = addPyOptimizer(optimizers_submodule, "SGD") - .def(py::init<>(), R"doc( + .def(nanobind::init<>(), R"doc( Stochastic Gradient Descent (SGD) optimizer. SGD is a fundamental optimization algorithm that updates parameters by taking @@ -539,7 +554,7 @@ function must return a tuple of (value, gradient_vector). function=objective_with_grad ) )doc"); - py_sgd.def_readwrite("batch_size", &cudaq::optimizers::sgd::batch_size, R"doc( + py_sgd.def_rw("batch_size", &cudaq::optimizers::sgd::batch_size, R"doc( int: Number of samples per batch (default: 1). For stochastic optimization, determines how many samples are used to @@ -548,7 +563,7 @@ stochastic gradient descent. 
Larger batch sizes (mini-batch SGD) can provide more stable gradient estimates but require more computation per iteration. )doc"); - py_sgd.def_readwrite("step_size", &cudaq::optimizers::sgd::step_size, R"doc( + py_sgd.def_rw("step_size", &cudaq::optimizers::sgd::step_size, R"doc( float: Learning rate (step size) for parameter updates (default: 0.01). Controls the magnitude of parameter updates at each iteration. The update @@ -556,7 +571,7 @@ rule is: x_new = x_old - step_size * gradient. Typical values range from 0.001 to 0.1. Too large values can cause divergence, while too small values lead to slow convergence. )doc"); - py_sgd.def_readwrite("f_tol", &cudaq::optimizers::sgd::f_tol, R"doc( + py_sgd.def_rw("f_tol", &cudaq::optimizers::sgd::f_tol, R"doc( float: Convergence tolerance on the objective function value (default: 1e-4). Optimization terminates when the change in objective function value between @@ -566,7 +581,7 @@ gradients, convergence may be noisy. )doc"); } -void bindOptimizerWrapper(py::module &mod) { +void bindOptimizerWrapper(nanobind::module_ &mod) { bindOptimizationResult(mod); bindGradientStrategies(mod); bindOptimizers(mod); diff --git a/python/runtime/cudaq/algorithms/py_optimizer.h b/python/runtime/cudaq/algorithms/py_optimizer.h index bd90e44e3af..10ec35d46cd 100644 --- a/python/runtime/cudaq/algorithms/py_optimizer.h +++ b/python/runtime/cudaq/algorithms/py_optimizer.h @@ -8,11 +8,9 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind the `cudaq::optimizers::` to python. 
-void bindOptimizerWrapper(py::module &mod); +void bindOptimizerWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_resource_count.cpp b/python/runtime/cudaq/algorithms/py_resource_count.cpp index 2f30d7d6b87..53af2405cf5 100644 --- a/python/runtime/cudaq/algorithms/py_resource_count.cpp +++ b/python/runtime/cudaq/algorithms/py_resource_count.cpp @@ -10,17 +10,16 @@ #include "common/Resources.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" #include "utils/LinkedLibraryHolder.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" -#include - -namespace py = pybind11; +#include "utils/NanobindAdaptors.h" +#include +#include using namespace cudaq; static Resources estimate_resources_impl(const std::string &kernelName, MlirModule kernelMod, std::optional> choice, - py::args args) { + nanobind::args args) { auto &platform = cudaq::get_platform(); args = simplifiedValidateInputArguments(args); @@ -60,7 +59,7 @@ estimate_resources_impl(const std::string &kernelName, MlirModule kernelMod, return counts; } -void cudaq::bindCountResources(py::module &mod) { +void cudaq::bindCountResources(nanobind::module_ &mod) { mod.def("estimate_resources_impl", estimate_resources_impl, "See python documentation for estimate_resources."); } diff --git a/python/runtime/cudaq/algorithms/py_resource_count.h b/python/runtime/cudaq/algorithms/py_resource_count.h index af03edd8476..d307c83ed9c 100644 --- a/python/runtime/cudaq/algorithms/py_resource_count.h +++ b/python/runtime/cudaq/algorithms/py_resource_count.h @@ -8,8 +8,8 @@ #pragma once -#include +#include namespace cudaq { -void bindCountResources(pybind11::module &mod); +void bindCountResources(nanobind::module_ &mod); } diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp index 153d9c50fbc..5609ebe325a 100644 --- a/python/runtime/cudaq/algorithms/py_run.cpp +++ b/python/runtime/cudaq/algorithms/py_run.cpp @@ -11,26 +11,27 @@ 
#include "cudaq/algorithms/run.h" #include "cudaq_internal/compiler/LayoutInfo.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include using namespace cudaq; using namespace cudaq_internal::compiler; -static std::vector readRunResults(mlir::ModuleOp module, - mlir::Type ty, - details::RunResultSpan &results, - std::size_t count) { - std::vector ret; +static std::vector +readRunResults(mlir::ModuleOp module, mlir::Type ty, + details::RunResultSpan &results, std::size_t count) { + std::vector ret; std::size_t byteSize = results.lengthInBytes / count; for (std::size_t i = 0; i < results.lengthInBytes; i += byteSize) { - py::object obj = convertResult(module, ty, results.data + i); + nanobind::object obj = convertResult(module, ty, results.data + i); ret.push_back(obj); } return ret; @@ -90,19 +91,18 @@ pyRunTheKernel(const std::string &name, quantum_platform &platform, return results; } -static std::vector pyReadResults(details::RunResultSpan results, - mlir::ModuleOp mod, - std::size_t shots_count, - const std::string &name) { +static std::vector +pyReadResults(details::RunResultSpan results, mlir::ModuleOp mod, + std::size_t shots_count, const std::string &name) { auto returnTy = recoverReturnType(mod, name); return readRunResults(mod, returnTy, results, shots_count); } /// @brief Run `cudaq::run` on the provided kernel. -static std::vector +static std::vector run_impl(const std::string &shortName, MlirModule module, std::size_t shots_count, std::optional noise_model, - std::size_t qpu_id, py::args runtimeArgs) { + std::size_t qpu_id, nanobind::args runtimeArgs) { if (shots_count == 0) return {}; @@ -133,7 +133,7 @@ namespace { // When the `ready` future is set, the content of the buffer is filled. 
struct async_run_result { std::future ready; - std::vector *results; + std::vector *results; std::string *error; }; } // namespace @@ -142,7 +142,7 @@ struct async_run_result { static async_run_result run_async_impl(const std::string &shortName, MlirModule module, std::size_t shots_count, std::optional noise_model, - std::size_t qpu_id, py::args runtimeArgs) { + std::size_t qpu_id, nanobind::args runtimeArgs) { if (!shots_count) return {}; @@ -162,7 +162,7 @@ run_async_impl(const std::string &shortName, MlirModule module, "Noise model is not supported on remote platforms."); async_run_result result; - result.results = new std::vector(); + result.results = new std::vector(); result.error = new std::string(); if (shots_count == 0) { @@ -184,7 +184,7 @@ run_async_impl(const std::string &shortName, MlirModule module, { // Release GIL to allow c++ threads, all code inside the scope is c++, so // there is no need to re-acquire the GIL inside the thread. - py::gil_scoped_release gil_release{}; + nanobind::gil_scoped_release gil_release{}; QuantumTask wrapped = detail::make_copyable_function( [sp = std::move(spanPromise), ep = std::move(errorPromise), noise_model = std::move(noise_model), qpu_id, name = shortName, @@ -214,7 +214,7 @@ run_async_impl(const std::string &shortName, MlirModule module, { // Release GIL to allow c++ threads, re-acquire for conversion of the // results to python objects. 
- py::gil_scoped_release gil_release{}; + nanobind::gil_scoped_release gil_release{}; auto resultFuture = std::async(std::launch::deferred, [sf = std::move(spanFuture), ef = std::move(errorFuture), @@ -224,7 +224,7 @@ run_async_impl(const std::string &shortName, MlirModule module, std::swap(*errorPtr, error); if (error.empty()) { auto span = sf.get(); - py::gil_scoped_acquire gil{}; + nanobind::gil_scoped_acquire gil{}; auto results = pyReadResults(span, mod, shots_count, shortName); std::swap(*resultsPtr, results); @@ -237,7 +237,7 @@ run_async_impl(const std::string &shortName, MlirModule module, } /// @brief Bind the run cudaq function. -void cudaq::bindPyRun(py::module &mod) { +void cudaq::bindPyRun(nanobind::module_ &mod) { mod.def("run_impl", run_impl, R"#( Run the provided `kernel` with the given kernel arguments over the specified @@ -255,8 +255,8 @@ number of circuit executions (`shots_count`). } /// @brief Bind the run_async cudaq function. -void cudaq::bindPyRunAsync(py::module &mod) { - py::class_(mod, "AsyncRunResultImpl", "") +void cudaq::bindPyRunAsync(nanobind::module_ &mod) { + nanobind::class_(mod, "AsyncRunResultImpl", "") .def( "get", [](async_run_result &self) { diff --git a/python/runtime/cudaq/algorithms/py_run.h b/python/runtime/cudaq/algorithms/py_run.h index 3e2c10df6b4..c1070f1f443 100644 --- a/python/runtime/cudaq/algorithms/py_run.h +++ b/python/runtime/cudaq/algorithms/py_run.h @@ -8,9 +8,9 @@ #pragma once -#include +#include namespace cudaq { -void bindPyRun(pybind11::module &mod); -void bindPyRunAsync(pybind11::module &mod); +void bindPyRun(nanobind::module_ &mod); +void bindPyRunAsync(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_sample_async.cpp b/python/runtime/cudaq/algorithms/py_sample_async.cpp index dfe50fcb4b5..43deba6c1ce 100644 --- a/python/runtime/cudaq/algorithms/py_sample_async.cpp +++ b/python/runtime/cudaq/algorithms/py_sample_async.cpp @@ -10,21 +10,21 @@ #include 
"common/DeviceCodeRegistry.h" #include "cudaq/algorithms/sample.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include -#include - -namespace py = pybind11; +#include +#include +#include using namespace cudaq; static async_sample_result sample_async_impl( const std::string &shortName, MlirModule module, std::size_t shots_count, std::optional noise_model, bool explicit_measurements, - std::size_t qpu_id, py::args runtimeArgs) { + std::size_t qpu_id, nanobind::args runtimeArgs) { mlir::ModuleOp mod = unwrap(module); runtimeArgs = simplifiedValidateInputArguments(runtimeArgs); @@ -40,7 +40,7 @@ static async_sample_result sample_async_impl( auto opaques = marshal_arguments_for_module_launch(mod, runtimeArgs, fnOp); // Should only have C++ going on here, safe to release the GIL - py::gil_scoped_release release; + nanobind::gil_scoped_release release; // Use runSamplingAsync with noise model support. // The noise_model is passed by value to runSamplingAsync, which captures @@ -60,7 +60,7 @@ static async_sample_result sample_async_impl( std::move(noise_model)); } -void cudaq::bindSampleAsync(py::module &mod) { +void cudaq::bindSampleAsync(nanobind::module_ &mod) { // Async. result wrapper for Python kernels, which also holds the Python MLIR // context. // @@ -74,8 +74,8 @@ void cudaq::bindSampleAsync(py::module &mod) { // then track a reference (ref count) to the context of the temporary (rval) // kernel. - py::class_(mod, "AsyncSampleResultImpl", - R"#( + nanobind::class_(mod, "AsyncSampleResultImpl", + R"#( A data-type containing the results of a call to :func:`sample_async`. The `AsyncSampleResult` models a future-like type, whose :class:`SampleResult` may be returned via an invocation of the `get` method. 
This kicks off a wait on the @@ -83,14 +83,15 @@ current thread until the results are available. See `future `_ for more information on this programming pattern. )#") - .def(py::init([](std::string inJson) { - async_sample_result f; - std::istringstream is(inJson); - is >> f; - return f; - })) + .def("__init__", + [](async_sample_result *self, std::string inJson) { + async_sample_result f; + std::istringstream is(inJson); + is >> f; + new (self) async_sample_result(std::move(f)); + }) .def("get", &async_sample_result::get, - py::call_guard(), + nanobind::call_guard(), "Return the :class:`SampleResult` from the asynchronous sample " "execution.\n") .def( diff --git a/python/runtime/cudaq/algorithms/py_sample_async.h b/python/runtime/cudaq/algorithms/py_sample_async.h index b494b2631bb..ec1c69476ac 100644 --- a/python/runtime/cudaq/algorithms/py_sample_async.h +++ b/python/runtime/cudaq/algorithms/py_sample_async.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindSampleAsync(py::module &mod); +void bindSampleAsync(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp b/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp index 4bf979ac8d9..064672787bc 100644 --- a/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp +++ b/python/runtime/cudaq/algorithms/py_sample_ptsbe.cpp @@ -20,13 +20,15 @@ #include "cudaq/ptsbe/strategies/OrderedSamplingStrategy.h" #include "cudaq/ptsbe/strategies/ProbabilisticSamplingStrategy.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/IR.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include - -namespace py = pybind11; +#include +#include +#include +#include +#include using namespace cudaq; @@ -38,9 +40,9 @@ static ptsbe::sample_result pySamplePTSBE(const std::string &shortName, 
MlirModule module, std::size_t shots_count, noise_model noiseModel, std::optional max_trajectories, - py::object sampling_strategy, py::object shot_allocation_obj, - bool return_execution_data, bool include_sequential_data, - py::args runtimeArgs) { + nanobind::object sampling_strategy, + nanobind::object shot_allocation_obj, bool return_execution_data, + bool include_sequential_data, nanobind::args runtimeArgs) { if (shots_count == 0) return ptsbe::sample_result(); @@ -51,11 +53,12 @@ pySamplePTSBE(const std::string &shortName, MlirModule module, if (!sampling_strategy.is_none()) ptsbe_options.strategy = - sampling_strategy.cast>(); + nanobind::cast>( + sampling_strategy); if (!shot_allocation_obj.is_none()) ptsbe_options.shot_allocation = - shot_allocation_obj.cast(); + nanobind::cast(shot_allocation_obj); auto mod = unwrap(module); runtimeArgs = simplifiedValidateInputArguments(runtimeArgs); @@ -104,13 +107,12 @@ struct AsyncPTSBESampleResultImpl { } // namespace /// @brief Run PTSBE sampling asynchronously from Python. 
-static AsyncPTSBESampleResultImpl -pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, - std::size_t shots_count, noise_model &noiseModel, - std::optional max_trajectories, - py::object sampling_strategy, py::object shot_allocation_obj, - bool return_execution_data, bool include_sequential_data, - py::args runtimeArgs) { +static AsyncPTSBESampleResultImpl pySampleAsyncPTSBE( + const std::string &shortName, MlirModule module, std::size_t shots_count, + noise_model &noiseModel, std::optional max_trajectories, + nanobind::object sampling_strategy, nanobind::object shot_allocation_obj, + bool return_execution_data, bool include_sequential_data, + nanobind::args runtimeArgs) { ptsbe::PTSBEOptions ptsbe_options; ptsbe_options.return_execution_data = return_execution_data; @@ -119,11 +121,12 @@ pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, if (!sampling_strategy.is_none()) ptsbe_options.strategy = - sampling_strategy.cast>(); + nanobind::cast>( + sampling_strategy); if (!shot_allocation_obj.is_none()) ptsbe_options.shot_allocation = - shot_allocation_obj.cast(); + nanobind::cast(shot_allocation_obj); auto mod = unwrap(module); runtimeArgs = simplifiedValidateInputArguments(runtimeArgs); @@ -135,7 +138,7 @@ pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, std::string kernelName = shortName; // Release GIL before launching async C++ work - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return AsyncPTSBESampleResultImpl(ptsbe::detail::runSamplingAsyncPTSBE( [opaques = std::move(opaques), kernelName, mod = mod.clone()]() mutable { [[maybe_unused]] auto result = @@ -145,20 +148,19 @@ pySampleAsyncPTSBE(const std::string &shortName, MlirModule module, noiseModel)); } -void cudaq::bindSamplePTSBE(py::module &mod) { +void cudaq::bindSamplePTSBE(nanobind::module_ &mod) { auto ptsbe = mod.def_submodule( "ptsbe", "PTSBE (Pre-Trajectory Sampling with Batch Execution)"); // Base strategy class 
(abstract, not directly constructible) - py::class_>( + nanobind::class_( ptsbe, "PTSSamplingStrategy", "Base class for trajectory sampling strategies.") .def("name", &ptsbe::PTSSamplingStrategy::name, "Get the name of this strategy."); // Shot allocation strategy - py::enum_( + nanobind::enum_( ptsbe, "ShotAllocationType", "Strategy type for allocating shots across trajectories.") .value("PROPORTIONAL", ptsbe::ShotAllocationStrategy::Type::PROPORTIONAL, @@ -172,33 +174,36 @@ void cudaq::bindSamplePTSBE(py::module &mod) { ptsbe::ShotAllocationStrategy::Type::HIGH_WEIGHT_BIAS, "Bias toward high-weight error trajectories."); - py::class_( + nanobind::class_( ptsbe, "ShotAllocationStrategy", "Strategy for allocating shots across selected trajectories.") - .def(py::init<>(), "Create a default (PROPORTIONAL) strategy.") - .def(py::init([](ptsbe::ShotAllocationStrategy::Type t, double bias, - std::optional seed) { - return ptsbe::ShotAllocationStrategy(t, bias, seed); - }), - py::arg("type"), py::arg("bias_strength") = 2.0, - py::arg("seed") = py::none(), - "Create a strategy with specified type, optional bias strength, " - "and optional random seed. When seed is None (default), uses " - "CUDA-Q's global random seed.") - .def_readwrite("type", &ptsbe::ShotAllocationStrategy::type, - "The allocation strategy type.") - .def_readwrite( - "bias_strength", &ptsbe::ShotAllocationStrategy::bias_strength, - "Bias factor for weighted strategies. Default value is 2.0."); + .def(nanobind::init<>(), "Create a default (PROPORTIONAL) strategy.") + .def( + "__init__", + [](ptsbe::ShotAllocationStrategy *self, + ptsbe::ShotAllocationStrategy::Type t, double bias, + std::optional seed) { + new (self) ptsbe::ShotAllocationStrategy(t, bias, seed); + }, + nanobind::arg("type"), nanobind::arg("bias_strength") = 2.0, + nanobind::arg("seed") = nanobind::none(), + "Create a strategy with specified type, optional bias strength, " + "and optional random seed. 
When seed is None (default), uses " + "CUDA-Q's global random seed.") + .def_rw("type", &ptsbe::ShotAllocationStrategy::type, + "The allocation strategy type.") + .def_rw("bias_strength", &ptsbe::ShotAllocationStrategy::bias_strength, + "Bias factor for weighted strategies. Default value is 2.0."); // Concrete strategies - py::class_>( + nanobind::class_( ptsbe, "ProbabilisticSamplingStrategy", "Sample trajectories randomly based on their occurrence probabilities.") - .def(py::init, std::optional>(), - py::arg("seed") = py::none(), - py::arg("max_trajectory_samples") = py::none(), + .def(nanobind::init, + std::optional>(), + nanobind::arg("seed") = nanobind::none(), + nanobind::arg("max_trajectory_samples") = nanobind::none(), "Create a probabilistic strategy with optional random seed and " "max trajectory sample count. When seed is None (default), uses " "CUDA-Q's global random seed. " @@ -206,20 +211,19 @@ void cudaq::bindSamplePTSBE(py::module &mod) { "The loop stops early once max_trajectories unique patterns are " "found. 
When None (default), a budget is auto-calculated."); - py::class_>( + nanobind::class_( ptsbe, "OrderedSamplingStrategy", "Sample trajectories sorted by probability in descending order.") - .def(py::init<>(), "Create an ordered strategy."); + .def(nanobind::init<>(), "Create an ordered strategy."); - py::class_>( + nanobind::class_( ptsbe, "ExhaustiveSamplingStrategy", "Enumerate all possible trajectories in lexicographic order.") - .def(py::init<>(), "Create an exhaustive strategy."); + .def(nanobind::init<>(), "Create an exhaustive strategy."); // Trace instruction type enum - py::enum_( + nanobind::enum_( ptsbe, "TraceInstructionType", "Type discriminator for trace instructions.") .value("Gate", ptsbe::TraceInstructionType::Gate) @@ -228,47 +232,48 @@ void cudaq::bindSamplePTSBE(py::module &mod) { .export_values(); // Trace instruction - py::class_( + nanobind::class_( ptsbe, "TraceInstruction", "Single operation in the execution trace.") - .def_property_readonly( + .def_prop_ro( "type", [](const ptsbe::TraceInstruction &self) { return self.type; }) - .def_property_readonly( + .def_prop_ro( "name", [](const ptsbe::TraceInstruction &self) { return self.name; }) - .def_property_readonly("targets", - [](const ptsbe::TraceInstruction &self) { - return std::vector( - self.targets.begin(), self.targets.end()); - }) - .def_property_readonly("controls", - [](const ptsbe::TraceInstruction &self) { - return std::vector( - self.controls.begin(), self.controls.end()); - }) - .def_property_readonly("params", - [](const ptsbe::TraceInstruction &self) { - return std::vector(self.params.begin(), - self.params.end()); - }) + .def_prop_ro("targets", + [](const ptsbe::TraceInstruction &self) { + return std::vector(self.targets.begin(), + self.targets.end()); + }) + .def_prop_ro("controls", + [](const ptsbe::TraceInstruction &self) { + return std::vector(self.controls.begin(), + self.controls.end()); + }) + .def_prop_ro("params", + [](const ptsbe::TraceInstruction &self) { + return 
std::vector(self.params.begin(), + self.params.end()); + }) .def("__repr__", [](const ptsbe::TraceInstruction &self) { return "TraceInstruction(" + self.name + " on " + std::to_string(self.targets.size()) + " qubits)"; }); // Kraus selection (cudaq:: namespace) - py::class_(ptsbe, "KrausSelection", - "Reference to a single Kraus operator selection.") - .def_property_readonly( + nanobind::class_( + ptsbe, "KrausSelection", + "Reference to a single Kraus operator selection.") + .def_prop_ro( "circuit_location", [](const KrausSelection &self) { return self.circuit_location; }) - .def_property_readonly( + .def_prop_ro( "kraus_operator_index", [](const KrausSelection &self) { return self.kraus_operator_index; }) - .def_property_readonly( - "is_error", [](const KrausSelection &self) { return self.is_error; }) - .def_property_readonly( - "qubits", [](const KrausSelection &self) { return self.qubits; }) - .def_property_readonly( - "op_name", [](const KrausSelection &self) { return self.op_name; }) + .def_prop_ro("is_error", + [](const KrausSelection &self) { return self.is_error; }) + .def_prop_ro("qubits", + [](const KrausSelection &self) { return self.qubits; }) + .def_prop_ro("op_name", + [](const KrausSelection &self) { return self.op_name; }) .def("__repr__", [](const KrausSelection &self) { return "KrausSelection(loc=" + std::to_string(self.circuit_location) + ", idx=" + std::to_string(self.kraus_operator_index) + @@ -276,27 +281,25 @@ void cudaq::bindSamplePTSBE(py::module &mod) { }); // Kraus trajectory (cudaq:: namespace) - py::class_( + nanobind::class_( ptsbe, "KrausTrajectory", "Complete specification of one noise trajectory with outcomes.") - .def_property_readonly( + .def_prop_ro( "trajectory_id", [](const KrausTrajectory &self) { return self.trajectory_id; }) - .def_property_readonly( - "probability", - [](const KrausTrajectory &self) { return self.probability; }) - .def_property_readonly( - "num_shots", - [](const KrausTrajectory &self) { return 
self.num_shots; }) - .def_readonly("multiplicity", &KrausTrajectory::multiplicity, - "Number of times this trajectory was sampled.") - .def_readonly("weight", &KrausTrajectory::weight, - "Allocation weight for shot distribution.") - .def_property_readonly( + .def_prop_ro("probability", + [](const KrausTrajectory &self) { return self.probability; }) + .def_prop_ro("num_shots", + [](const KrausTrajectory &self) { return self.num_shots; }) + .def_ro("multiplicity", &KrausTrajectory::multiplicity, + "Number of times this trajectory was sampled.") + .def_ro("weight", &KrausTrajectory::weight, + "Allocation weight for shot distribution.") + .def_prop_ro( "kraus_selections", [](const KrausTrajectory &self) { return self.kraus_selections; }, - py::return_value_policy::reference_internal) - .def_property_readonly( + nanobind::rv_policy::reference_internal) + .def_prop_ro( "measurement_counts", [](const KrausTrajectory &self) { return self.measurement_counts; }) .def("__repr__", [](const KrausTrajectory &self) { @@ -306,34 +309,35 @@ void cudaq::bindSamplePTSBE(py::module &mod) { }); // PTSBE execution data container - py::class_( + nanobind::class_( ptsbe, "PTSBEExecutionData", "Container for PTSBE execution data including circuit structure, " "trajectory specifications, and per-trajectory measurement outcomes.") - .def_property_readonly( + .def_prop_ro( "instructions", [](const ptsbe::PTSBEExecutionData &self) -> const std::vector & { return self.instructions; }, - py::return_value_policy::reference_internal) - .def_property_readonly( + nanobind::rv_policy::reference_internal) + .def_prop_ro( "trajectories", [](const ptsbe::PTSBEExecutionData &self) -> const std::vector & { return self.trajectories; }, - py::return_value_policy::reference_internal) + nanobind::rv_policy::reference_internal) .def( "count_instructions", [](const ptsbe::PTSBEExecutionData &self, - ptsbe::TraceInstructionType type, py::object name) -> std::size_t { + ptsbe::TraceInstructionType type, + 
nanobind::object name) -> std::size_t { std::optional nameOpt; if (!name.is_none()) - nameOpt = name.cast(); + nameOpt = nanobind::cast(name); return self.count_instructions(type, nameOpt); }, - py::arg("type"), py::arg("name") = py::none(), + nanobind::arg("type"), nanobind::arg("name") = nanobind::none(), "Count instructions of a given type.") .def( "get_trajectory", @@ -344,7 +348,8 @@ void cudaq::bindSamplePTSBE(py::module &mod) { return nullptr; return &result.value().get(); }, - py::return_value_policy::reference_internal, py::arg("trajectory_id"), + nanobind::rv_policy::reference_internal, + nanobind::arg("trajectory_id"), "Look up a trajectory by its ID. Returns None if not found.") .def("__repr__", [](const ptsbe::PTSBEExecutionData &self) { @@ -358,10 +363,10 @@ void cudaq::bindSamplePTSBE(py::module &mod) { }); // PTSBE sample result (subclass of sample_result) - py::class_( + nanobind::class_( ptsbe, "PTSBESampleResult", "PTSBE sample result with optional execution data.") - .def_property_readonly( + .def_prop_ro( "ptsbe_execution_data", [](const ptsbe::sample_result &self) -> const ptsbe::PTSBEExecutionData * { @@ -371,31 +376,36 @@ void cudaq::bindSamplePTSBE(py::module &mod) { }, // reference_internal ties the returned object's lifetime to self, // so the pointer into internal data stays valid. 
- py::return_value_policy::reference_internal, + nanobind::rv_policy::reference_internal, "PTSBE execution data if return_execution_data was True, None " "otherwise.") .def("has_execution_data", &ptsbe::sample_result::has_execution_data, "Check if execution data is available."); // Async PTSBE sample result wrapper - py::class_( + nanobind::class_( ptsbe, "AsyncSampleResultImpl", "Future-like wrapper for asynchronous PTSBE sampling.") .def("get", &AsyncPTSBESampleResultImpl::get, - py::call_guard(), + nanobind::call_guard(), "Block until the PTSBE sampling result is available and return it."); // PTSBE sample implementation - ptsbe.def("sample_impl", pySamplePTSBE, + ptsbe.def("sample_impl", pySamplePTSBE, nanobind::arg("kernel_name"), + nanobind::arg("module"), nanobind::arg("shots_count"), + nanobind::arg("noise_model"), nanobind::arg("max_trajectories"), + nanobind::arg("sampling_strategy").none(), + nanobind::arg("shot_allocation").none(), + nanobind::arg("return_execution_data"), + nanobind::arg("include_sequential_data"), R"pbdoc( Run PTSBE sampling on the provided kernel. Args: kernel_name: The kernel name. module: The MLIR module. - return_type: The MLIR return type. shots_count: The number of shots. - noise_model: Optional noise model for gate-based noise; may be None. + noise_model: The noise model. max_trajectories: Maximum unique trajectories, or None to use shots. sampling_strategy: Sampling strategy or None for default (probabilistic). shot_allocation: Shot allocation strategy or None for default (proportional). @@ -409,6 +419,13 @@ Run PTSBE sampling on the provided kernel. 
// PTSBE async sample implementation ptsbe.def("sample_async_impl", pySampleAsyncPTSBE, + nanobind::arg("kernel_name"), nanobind::arg("module"), + nanobind::arg("shots_count"), nanobind::arg("noise_model"), + nanobind::arg("max_trajectories"), + nanobind::arg("sampling_strategy").none(), + nanobind::arg("shot_allocation").none(), + nanobind::arg("return_execution_data"), + nanobind::arg("include_sequential_data"), "Run PTSBE sampling asynchronously. Returns an " "AsyncSampleResultImpl."); } diff --git a/python/runtime/cudaq/algorithms/py_sample_ptsbe.h b/python/runtime/cudaq/algorithms/py_sample_ptsbe.h index 2c5f2869486..ad8386efd64 100644 --- a/python/runtime/cudaq/algorithms/py_sample_ptsbe.h +++ b/python/runtime/cudaq/algorithms/py_sample_ptsbe.h @@ -8,8 +8,8 @@ #pragma once -#include +#include namespace cudaq { -void bindSamplePTSBE(pybind11::module &mod); +void bindSamplePTSBE(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp index 38fa842f020..a1ff9c2cd02 100644 --- a/python/runtime/cudaq/algorithms/py_state.cpp +++ b/python/runtime/cudaq/algorithms/py_state.cpp @@ -13,8 +13,16 @@ #include "cudaq/algorithms/get_state.h" #include "cudaq/runtime/logger/logger.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" +#include +#include +#include +#include +#include +#include +#include +#include using namespace cudaq; @@ -41,7 +49,7 @@ static std::vector bitStringToIntVec(const std::string &bitString) { /// @brief Run `cudaq::get_state` on the provided kernel and spin operator. 
static state get_state_impl(const std::string &shortName, MlirModule mod, - py::args args) { + nanobind::args args) { auto closure = [=]() { return marshal_and_launch_module(shortName, mod, args); }; @@ -51,7 +59,7 @@ static state get_state_impl(const std::string &shortName, MlirModule mod, static std::future get_state_async_impl(const std::string &shortName, MlirModule module, std::size_t qpu_id, - py::args args) { + nanobind::args args) { // Launch the asynchronous execution. auto mod = unwrap(module); std::string kernelName = shortName; @@ -59,7 +67,7 @@ static std::future get_state_async_impl(const std::string &shortName, auto fnOp = getKernelFuncOp(mod, shortName); auto opaques = marshal_arguments_for_module_launch(mod, args, fnOp); - py::gil_scoped_release release; + nanobind::gil_scoped_release release; return details::runGetStateAsync( detail::make_copyable_function([opaques = std::move(opaques), kernelName, mod = mod.clone()]() mutable { @@ -131,12 +139,12 @@ class PyRemoteSimulationState : public RemoteSimulationState { /// @brief Run `cudaq::get_state` for remote execution targets on the provided /// kernel and args -state pyGetStateRemote(py::object kernel, py::args args) { - if (py::hasattr(kernel, "compile")) +state pyGetStateRemote(nanobind::object kernel, nanobind::args args) { + if (nanobind::hasattr(kernel, "compile")) kernel.attr("compile")(); - auto kernelName = kernel.attr("uniqName").cast(); - auto kernelMod = kernel.attr("qkeModule").cast(); + auto kernelName = nanobind::cast(kernel.attr("uniqName")); + auto kernelMod = nanobind::cast(kernel.attr("qkeModule")); args = simplifiedValidateInputArguments(args); auto *argData = toOpaqueArgs(args, kernelMod, kernelName); #if 0 @@ -170,7 +178,7 @@ class PyQPUState : public QPUState { /// @brief Run `cudaq::get_state` for qpu targets on the provided /// kernel and args state pyGetStateQPU(const std::string &kernelName, MlirModule kernelMod, - py::args args) { + nanobind::args args) { auto moduleOp = 
unwrap(kernelMod); std::string mlirCode; llvm::raw_string_ostream outStr(mlirCode); @@ -182,45 +190,62 @@ state pyGetStateQPU(const std::string &kernelName, MlirModule kernelMod, return state(new PyQPUState(kernelName, mlirCode, argData)); } -state pyGetStateLibraryMode(py::object kernel, py::args args) { +state pyGetStateLibraryMode(nanobind::object kernel, nanobind::args args) { return details::extractState([&]() mutable { if (0 == args.size()) kernel(); else { - std::vector argsData; + std::vector argsData; for (size_t i = 0; i < args.size(); i++) { - py::object arg = args[i]; - argsData.emplace_back(std::forward(arg)); + nanobind::object arg = args[i]; + argsData.emplace_back(std::forward(arg)); } kernel(std::move(argsData)); } }); } -static py::buffer_info getCupyBufferInfo(py::buffer cupy_buffer) { - // Note: cupy 13.5+ arrays will bind (overload resolution) to a py::buffer - // type. However, we cannot access the underlying buffer info via a +/// @brief Helper struct to hold buffer metadata, analogous to Python's +/// buffer_info. +struct BufferInfo { + void *ptr = nullptr; + std::size_t itemsize = 0; + std::string format; + std::size_t ndim = 0; + std::vector shape; + std::vector strides; + bool readonly = false; + std::size_t size = 0; // total number of elements +}; + +static BufferInfo getCupyBufferInfo(nanobind::object cupy_buffer) { + // Note: cupy 13.5+ arrays will bind (overload resolution) to a + // nanobind::object type. However, we cannot access the underlying buffer info + // via a // `.request()` as it will throw unless that is managed memory. Here, we - // retrieve and construct buffer_info from the CuPy array interface. + // retrieve and construct BufferInfo from the CuPy array interface. 
- if (!py::hasattr(cupy_buffer, "__cuda_array_interface__")) { + if (!nanobind::hasattr(cupy_buffer, "__cuda_array_interface__")) { throw std::runtime_error("Buffer is not a CuPy array"); } - py::dict cupy_array_info = cupy_buffer.attr("__cuda_array_interface__"); + nanobind::dict cupy_array_info = nanobind::cast( + cupy_buffer.attr("__cuda_array_interface__")); // Ref: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html // example: {'shape': (2, 2), 'typestr': '(); - void *dataPtr = (void *)dataInfo[0].cast(); - const bool readOnly = dataInfo[1].cast(); - auto shapeTuple = cupy_array_info["shape"].cast(); + nanobind::tuple dataInfo = + nanobind::cast(cupy_array_info["data"]); + void *dataPtr = (void *)nanobind::cast(dataInfo[0]); + const bool readOnly = nanobind::cast(dataInfo[1]); + auto shapeTuple = nanobind::cast(cupy_array_info["shape"]); std::vector extents; for (std::size_t i = 0; i < shapeTuple.size(); i++) { - extents.push_back(shapeTuple[i].cast()); + extents.push_back(nanobind::cast(shapeTuple[i])); } - const std::string typeStr = cupy_array_info["typestr"].cast(); + const std::string typeStr = + nanobind::cast(cupy_array_info["typestr"]); if (typeStr != "), - py::format_descriptor>::format()) - : std::make_tuple( - sizeof(std::complex), - py::format_descriptor>::format()); + std::size_t dataTypeSize = isDoublePrecision ? sizeof(std::complex) + : sizeof(std::complex); + std::string desc = isDoublePrecision ? 
"Zd" : "Zf"; std::vector strides(extents.size(), dataTypeSize); for (size_t i = 1; i < extents.size(); ++i) strides[i] = strides[i - 1] * extents[i - 1]; - return py::buffer_info(dataPtr, dataTypeSize, /*itemsize */ - desc, extents.size(), /* ndim */ - extents, /* shape */ - strides, /* strides */ - readOnly /* readonly */ - ); + std::size_t totalSize = 1; + for (auto e : extents) + totalSize *= e; + + BufferInfo info; + info.ptr = dataPtr; + info.itemsize = dataTypeSize; + info.format = desc; + info.ndim = extents.size(); + info.shape = extents; + info.strides = strides; + info.readonly = readOnly; + info.size = totalSize; + return info; } -static cudaq::state createStateFromPyBuffer(py::buffer data, +/// @brief Helper to get BufferInfo from a numpy array via Python buffer +/// protocol. +static BufferInfo getNumpyBufferInfo(nanobind::object numpy_array) { + nanobind::module_ np = nanobind::module_::import_("numpy"); + auto dtype = numpy_array.attr("dtype"); + std::string dtypeStr = nanobind::cast(dtype.attr("name")); + + BufferInfo info; + if (dtypeStr == "complex64") { + info.itemsize = sizeof(std::complex); + info.format = "Zf"; + } else if (dtypeStr == "complex128") { + info.itemsize = sizeof(std::complex); + info.format = "Zd"; + } else { + info.format = dtypeStr; + info.itemsize = nanobind::cast(dtype.attr("itemsize")); + } + auto shapeTuple = nanobind::cast(numpy_array.attr("shape")); + info.ndim = shapeTuple.size(); + info.size = 1; + for (std::size_t i = 0; i < shapeTuple.size(); i++) { + auto ext = nanobind::cast(shapeTuple[i]); + info.shape.push_back(ext); + info.size *= ext; + } + auto stridesTuple = + nanobind::cast(numpy_array.attr("strides")); + for (std::size_t i = 0; i < stridesTuple.size(); i++) { + info.strides.push_back(nanobind::cast(stridesTuple[i])); + } + // Get the raw data pointer via numpy's ctypes interface + info.ptr = reinterpret_cast( + nanobind::cast(numpy_array.attr("ctypes").attr("data"))); + info.readonly = false; + return info; 
+} + +static cudaq::state createStateFromPyBuffer(nanobind::object data, LinkedLibraryHolder &holder) { - const bool isHostData = !py::hasattr(data, "__cuda_array_interface__"); + const bool isHostData = !nanobind::hasattr(data, "__cuda_array_interface__"); // Check that the target is GPU-based, i.e., can handle device // pointer. if (!holder.getTarget().config.GpuRequired && !isHostData) @@ -259,12 +325,11 @@ static cudaq::state createStateFromPyBuffer(py::buffer data, fmt::format("Current target '{}' does not support CuPy arrays.", holder.getTarget().name)); - auto info = isHostData ? data.request() : getCupyBufferInfo(data); + auto info = isHostData ? getNumpyBufferInfo(data) : getCupyBufferInfo(data); if (info.shape.size() > 2) throw std::runtime_error( "state.from_data only supports 1D or 2D array data."); - if (info.format != py::format_descriptor>::format() && - info.format != py::format_descriptor>::format()) + if (info.format != "Zf" && info.format != "Zd") throw std::runtime_error( "A numpy array with only floating point elements passed to " "`state.from_data`. Input must be of complex float type. Please add to " @@ -273,7 +338,7 @@ static cudaq::state createStateFromPyBuffer(py::buffer data, "`dtype=cudaq.complex()` for precision-agnostic code."); if (!isHostData || info.shape.size() == 1) { - if (info.format == py::format_descriptor>::format()) + if (info.format == "Zf") return state::from_data(std::make_pair( reinterpret_cast *>(info.ptr), info.size)); @@ -286,8 +351,7 @@ static cudaq::state createStateFromPyBuffer(py::buffer data, throw std::runtime_error( "state.from_data 2D array (density matrix) input must be " "square matrix data."); - const bool isDoublePrecision = - info.format == py::format_descriptor>::format(); + const bool isDoublePrecision = (info.format == "Zd"); const int64_t dataSize = isDoublePrecision ? 
sizeof(std::complex) : sizeof(std::complex); const bool rowMajor = @@ -313,15 +377,15 @@ static cudaq::state createStateFromPyBuffer(py::buffer data, } /// @brief Bind the get_state cudaq function -void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { - py::enum_(mod, "InitialStateType", - "Enumeration describing the initial state " - "type to be created in the backend") +void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { + nanobind::enum_(mod, "InitialStateType", + "Enumeration describing the initial state " + "type to be created in the backend") .value("ZERO", InitialState::ZERO) .value("UNIFORM", InitialState::UNIFORM) .export_values(); - py::class_( + nanobind::class_( mod, "Tensor", "The `Tensor` describes a pointer to simulation data as well as the rank " "and extents for that tensorial data it represents.") @@ -329,87 +393,112 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { [](SimulationState::Tensor &tensor) { return reinterpret_cast(tensor.data); }) - .def_readonly("extents", &SimulationState::Tensor::extents) + .def_ro("extents", &SimulationState::Tensor::extents) .def("get_rank", &SimulationState::Tensor::get_rank) .def("get_element_size", &SimulationState::Tensor::element_size) .def("get_num_elements", &SimulationState::Tensor::get_num_elements); - py::class_( - mod, "State", py::buffer_protocol(), + nanobind::class_( + mod, "State", "A data-type representing the quantum state of the internal simulator. " "This type is not user-constructible and instances can only be retrieved " "via the `cudaq.get_state(...)` function or the static " "`cudaq.State.from_data()` method.\n") - .def_buffer([](const state &self) { - if (self.get_num_tensors() != 1) - throw std::runtime_error("Numpy interop is only supported for vector " - "and matrix state data."); - - // This method is used by Pybind to enable interoperability with NumPy - // array data. 
We therefore must be careful since the state data may - // actually be on GPU device. - - // Get the data pointer. - // Data may be on GPU device, if so we must make a copy to host. - // If users do not want this copy, they will have to operate apart - // from Numpy - void *dataPtr = nullptr; - auto stateVector = self.get_tensor(); - auto precision = self.get_precision(); - if (self.is_on_gpu()) { - // This is device data, transfer to host, which gives us - // ownership of a new data pointer on host. Store it globally - // here so we ensure that it gets cleaned up. - auto numElements = stateVector.get_num_elements(); - if (precision == SimulationState::precision::fp32) { - auto *hostData = new std::complex[numElements]; - self.to_host(hostData, numElements); - dataPtr = reinterpret_cast(hostData); - } else { - auto *hostData = new std::complex[numElements]; - self.to_host(hostData, numElements); - dataPtr = reinterpret_cast(hostData); - } - hostDataFromDevice.emplace_back(dataPtr, [precision](void *data) { - CUDAQ_INFO("freeing data that was copied from GPU device for " - "compatibility with NumPy"); - // Use delete[] to match new[] allocation (not free()) - if (precision == SimulationState::precision::fp32) - delete[] static_cast *>(data); - else - delete[] static_cast *>(data); - }); - } else { - dataPtr = self.get_tensor().data; - } - - // We need to know the precision of the simulation data to get the - // data type size and the format descriptor - auto [dataTypeSize, desc] = - precision == SimulationState::precision::fp32 - ? std::make_tuple( - sizeof(std::complex), - py::format_descriptor>::format()) - : std::make_tuple( - sizeof(std::complex), - py::format_descriptor>::format()); - - // Get the shape of the data. Return buffer info in a correctly - // shaped manner. 
- auto shape = self.get_tensor().extents; - if (shape.size() != 1) - return py::buffer_info(dataPtr, dataTypeSize, /*itemsize */ - desc, 2, /* ndim */ - {shape[0], shape[1]}, /* shape */ - {dataTypeSize * static_cast(shape[1]), - dataTypeSize}, /* strides */ - true /* readonly */ - ); - return py::buffer_info(dataPtr, dataTypeSize, /*itemsize */ - desc, 1, /* ndim */ - {shape[0]}, /* shape */ - {dataTypeSize}); - }) + .def( + "__array__", + [](const state &self, nanobind::object dtype_obj, + nanobind::object copy_obj) { + if (self.get_num_tensors() != 1) + throw std::runtime_error( + "Numpy interop is only supported for vector " + "and matrix state data."); + + // This method enables interoperability with NumPy array data. + // We must be careful since the state data may actually be on GPU + // device. + + nanobind::module_ np = nanobind::module_::import_("numpy"); + auto stateVector = self.get_tensor(); + auto precision = self.get_precision(); + auto shape = self.get_tensor().extents; + + // Determine numpy dtype + nanobind::object np_dtype = + precision == SimulationState::precision::fp32 + ? 
np.attr("complex64") + : np.attr("complex128"); + + if (self.is_on_gpu()) { + // This is device data, transfer to host + auto numElements = stateVector.get_num_elements(); + nanobind::object arr; + if (precision == SimulationState::precision::fp32) { + auto *hostData = new std::complex[numElements]; + self.to_host(hostData, numElements); + // Create numpy array and copy data + if (shape.size() != 1) { + nanobind::tuple np_shape = + nanobind::make_tuple(shape[0], shape[1]); + arr = np.attr("empty")(np_shape, np_dtype); + } else { + nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); + arr = np.attr("empty")(np_shape, np_dtype); + } + auto *destPtr = reinterpret_cast *>( + nanobind::cast(arr.attr("ctypes").attr("data"))); + std::memcpy(destPtr, hostData, + numElements * sizeof(std::complex)); + delete[] hostData; + } else { + auto *hostData = new std::complex[numElements]; + self.to_host(hostData, numElements); + if (shape.size() != 1) { + nanobind::tuple np_shape = + nanobind::make_tuple(shape[0], shape[1]); + arr = np.attr("empty")(np_shape, np_dtype); + } else { + nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); + arr = np.attr("empty")(np_shape, np_dtype); + } + auto *destPtr = reinterpret_cast *>( + nanobind::cast(arr.attr("ctypes").attr("data"))); + std::memcpy(destPtr, hostData, + numElements * sizeof(std::complex)); + delete[] hostData; + } + return arr; + } + + // Host data path - wrap existing memory + void *dataPtr = self.get_tensor().data; + auto numElements = stateVector.get_num_elements(); + if (shape.size() != 1) { + nanobind::tuple np_shape = + nanobind::make_tuple(shape[0], shape[1]); + // Use np.frombuffer-like approach: create array from pointer + nanobind::object arr = np.attr("empty")(np_shape, np_dtype); + auto *destPtr = reinterpret_cast( + nanobind::cast(arr.attr("ctypes").attr("data"))); + std::size_t dataTypeSize = + precision == SimulationState::precision::fp32 + ? 
sizeof(std::complex) + : sizeof(std::complex); + std::memcpy(destPtr, dataPtr, numElements * dataTypeSize); + return arr; + } + nanobind::tuple np_shape = nanobind::make_tuple(shape[0]); + nanobind::object arr = np.attr("empty")(np_shape, np_dtype); + auto *destPtr = reinterpret_cast( + nanobind::cast(arr.attr("ctypes").attr("data"))); + std::size_t dataTypeSize = + precision == SimulationState::precision::fp32 + ? sizeof(std::complex) + : sizeof(std::complex); + std::memcpy(destPtr, dataPtr, numElements * dataTypeSize); + return arr; + }, + nanobind::arg("dtype") = nanobind::none(), + nanobind::arg("copy") = nanobind::none()) .def( "__len__", [](state &self) { @@ -433,16 +522,16 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { "Convert the address of the state object to an integer.") .def_static( "from_data", - [&](py::buffer data) { + [&](nanobind::object data) { return createStateFromPyBuffer(data, holder); }, "Return a state from data.") .def_static( "from_data", - [&holder](const std::vector &tensors) { + [&holder](const std::vector &tensors) { const bool isHostData = tensors.empty() || - !py::hasattr(tensors[0], "__cuda_array_interface__"); + !nanobind::hasattr(tensors[0], "__cuda_array_interface__"); // Check that the target is GPU-based, i.e., can handle device // pointer. if (!holder.getTarget().config.GpuRequired && !isHostData) @@ -451,8 +540,8 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { holder.getTarget().name)); TensorStateData tensorData; for (auto &tensor : tensors) { - auto info = - isHostData ? tensor.request() : getCupyBufferInfo(tensor); + auto info = isHostData ? 
getNumpyBufferInfo(tensor) + : getCupyBufferInfo(tensor); const std::vector extents(info.shape.begin(), info.shape.end()); tensorData.emplace_back( @@ -477,36 +566,38 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { "Return a state from matrix product state tensor data.") .def_static( "from_data", - [](const py::list &tensors) { - // Note: we must use Python type (py::list) for proper overload - // resolution. The overload for py::object, intended for cupy arrays - // (implementing Python array interface), may be overshadowed by any - // std::vector overloads. + [](const nanobind::list &tensors) { + // Note: we must use Python type (nanobind::list) for proper + // overload resolution. The overload for nanobind::object, intended + // for cupy arrays (implementing Python array interface), may be + // overshadowed by any std::vector overloads. TensorStateData tensorData; - for (auto &tensor : tensors) { + for (auto tensor : tensors) { // Make sure this is a CuPy array - if (!py::hasattr(tensor, "data")) + if (!nanobind::hasattr(tensor, "data")) throw std::runtime_error( - "invalid from_data operation on py::object - " + "invalid from_data operation on nanobind::object - " "only cupy array supported."); auto data = tensor.attr("data"); - if (!py::hasattr(data, "ptr")) + if (!nanobind::hasattr(data, "ptr")) throw std::runtime_error( - "invalid from_data operation on py::object tensors - " + "invalid from_data operation on nanobind::object tensors - " "only cupy array supported."); // We know this is a cupy device pointer. 
Start by ensuring it is // of proper complex type - auto typeStr = py::str(tensor.attr("dtype")).cast(); + auto typeStr = nanobind::cast( + tensor.attr("dtype").attr("name")); if (typeStr != "complex128") throw std::runtime_error( - "invalid from_data operation on py::object tensors - " + "invalid from_data operation on nanobind::object tensors - " "only cupy complex128 tensors supported."); - auto shape = tensor.attr("shape").cast(); + auto shape = + nanobind::cast(tensor.attr("shape")); std::vector extents; for (auto el : shape) - extents.emplace_back(el.cast()); - long ptr = data.attr("ptr").cast(); + extents.emplace_back(nanobind::cast(el)); + long ptr = nanobind::cast(data.attr("ptr")); tensorData.emplace_back( std::pair>{ reinterpret_cast *>(ptr), extents}); @@ -517,24 +608,24 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { "ndarray).") .def_static( "from_data", - [&holder](py::object opaqueData) { + [&holder](nanobind::object opaqueData) { // Note: This overload is no longer needed from cupy 13.5+ onward. // We can remove it in future releases. // Make sure this is a CuPy array - if (!py::hasattr(opaqueData, "data")) + if (!nanobind::hasattr(opaqueData, "data")) throw std::runtime_error( - "invalid from_data operation on py::object - " + "invalid from_data operation on nanobind::object - " "only cupy array supported."); auto data = opaqueData.attr("data"); - if (!py::hasattr(data, "ptr")) + if (!nanobind::hasattr(data, "ptr")) throw std::runtime_error( - "invalid from_data operation on py::object - " + "invalid from_data operation on nanobind::object - " "only cupy array supported."); // We know this is a cupy device pointer. 
Start by ensuring it is of // complex type - auto typeStr = - py::str(opaqueData.attr("dtype")).cast(); + auto typeStr = nanobind::cast( + opaqueData.attr("dtype").attr("name")); if (typeStr.find("float") != std::string::npos) throw std::runtime_error( "CuPy array with only floating point elements passed to " @@ -546,16 +637,17 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { // Compute the number of elements in the array std::vector extents; auto numElements = [&]() { - auto shape = opaqueData.attr("shape").cast(); + auto shape = + nanobind::cast(opaqueData.attr("shape")); std::size_t numElements = 1; for (auto el : shape) { - numElements *= el.cast(); - extents.emplace_back(el.cast()); + numElements *= nanobind::cast(el); + extents.emplace_back(nanobind::cast(el)); } return numElements; }(); - long ptr = data.attr("ptr").cast(); + long ptr = nanobind::cast(data.attr("ptr")); if (holder.getTarget().name == "dynamics") { // For dynamics, we need to send on the extents to distinguish // state vector vs density matrix. @@ -587,7 +679,7 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { .def( "getTensor", [](state &self, std::size_t idx) { return self.get_tensor(idx); }, - py::arg("idx") = 0, + nanobind::arg("idx") = 0, "Return the `idx` tensor making up this state representation.") .def( "getTensors", [](state &self) { return self.get_tensors(); }, @@ -699,7 +791,7 @@ index pair. [](state &self) { std::stringstream ss; self.dump(ss); - py::print(ss.str()); + nanobind::print(ss.str().c_str()); }, "Print the state to the console.") .def("__str__", @@ -714,7 +806,7 @@ index pair. "Compute the overlap between the provided :class:`State`'s.") .def( "overlap", - [&holder](state &self, py::buffer &other) { + [&holder](state &self, nanobind::object &other) { if (self.get_num_tensors() != 1) throw std::runtime_error("overlap NumPy interop only supported " "for vector and matrix state data."); @@ -724,24 +816,25 @@ index pair. 
"Compute the overlap between the provided :class:`State`'s.") .def( "overlap", - [](state &self, py::object other) { + [](state &self, nanobind::object other) { // Note: This overload is no longer needed from cupy 13.5+ onward. // We can remove it in future releases. Make sure this is a CuPy // array - if (!py::hasattr(other, "data")) + if (!nanobind::hasattr(other, "data")) throw std::runtime_error( - "invalid overlap operation on py::object - " + "invalid overlap operation on nanobind::object - " "only cupy array supported."); auto data = other.attr("data"); - if (!py::hasattr(data, "ptr")) + if (!nanobind::hasattr(data, "ptr")) throw std::runtime_error( - "invalid overlap operation on py::object - " + "invalid overlap operation on nanobind::object - " "only cupy array supported."); // We know this is a cupy device pointer. // Start by ensuring it is of complex type - auto typeStr = py::str(other.attr("dtype")).cast(); + auto typeStr = + nanobind::cast(other.attr("dtype").attr("name")); if (typeStr.find("float") != std::string::npos) throw std::runtime_error( "CuPy array with only floating point elements passed to " @@ -765,15 +858,15 @@ index pair. // Compute the number of elements in the other array auto numOtherElements = [&]() { - auto shape = other.attr("shape").cast(); + auto shape = nanobind::cast(other.attr("shape")); std::size_t numElements = 1; for (auto el : shape) - numElements *= el.cast(); + numElements *= nanobind::cast(el); return numElements; }(); // Cast the device ptr and perform the overlap - long ptr = data.attr("ptr").cast(); + long ptr = nanobind::cast(data.attr("ptr")); if (precision == SimulationState::precision::fp32) return self.overlap(state::from_data( std::make_pair(reinterpret_cast *>(ptr), @@ -787,7 +880,8 @@ index pair. mod.def( "get_state_impl", - [&](const std::string &shortName, MlirModule module, py::args args) { + [&](const std::string &shortName, MlirModule module, + nanobind::args args) { // Check for unsupported cases. 
if (holder.getTarget().name == "remote-mqpu" || holder.getTarget().name == "orca-photonics") @@ -800,7 +894,7 @@ index pair. }, "See the python documentation for get_state."); - py::class_( + nanobind::class_( mod, "AsyncStateResult", R"#(A data-type containing the results of a call to :func:`get_state_async`. The `AsyncStateResult` models a future-like type, whose @@ -810,14 +904,14 @@ See `future `_ for more information on this programming pattern.)#") .def( "get", [](async_state_result &self) { return self.get(); }, - py::call_guard(), + nanobind::call_guard(), "Return the :class:`State` from the asynchronous `get_state` " "accessor execution.\n"); mod.def( "get_state_async_impl", [&](const std::string &shortName, MlirModule module, std::size_t qpu_id, - py::args args) { + nanobind::args args) { // Check for unsupported cases. if (holder.getTarget().name == "remote-mqpu" || holder.getTarget().name == "nvqc" || diff --git a/python/runtime/cudaq/algorithms/py_state.h b/python/runtime/cudaq/algorithms/py_state.h index e290aa35e1a..7a7152f8d1f 100644 --- a/python/runtime/cudaq/algorithms/py_state.h +++ b/python/runtime/cudaq/algorithms/py_state.h @@ -8,11 +8,11 @@ #pragma once -#include -#include +#include +#include namespace cudaq { class LinkedLibraryHolder; -void bindPyState(pybind11::module &mod, LinkedLibraryHolder &holder); +void bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_translate.cpp b/python/runtime/cudaq/algorithms/py_translate.cpp index 4d5f834ed0c..503cbc38cce 100644 --- a/python/runtime/cudaq/algorithms/py_translate.cpp +++ b/python/runtime/cudaq/algorithms/py_translate.cpp @@ -13,8 +13,8 @@ #include "cudaq/platform/default/python/QPU.h" #include "cudaq/runtime/logger/logger.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include 
"mlir/Pass/PassManager.h" #include "mlir/Target/LLVMIR/Export.h" @@ -23,7 +23,7 @@ using namespace mlir; /// @brief Run `cudaq::translate` on the provided kernel. static std::string translate_impl(const std::string &shortName, MlirModule module, const std::string &format, - py::args runtimeArguments) { + nanobind::args runtimeArguments) { StringRef format_ = format; auto formatPair = format_.split(':'); auto mod = unwrap(module); @@ -66,7 +66,7 @@ static std::string translate_impl(const std::string &shortName, } /// @brief Bind the translate cudaq function -void cudaq::bindPyTranslate(py::module &mod) { +void cudaq::bindPyTranslate(nanobind::module_ &mod) { mod.def("translate_impl", translate_impl, "See python documentation for translate."); // Internal translation to QIR for testing and internal use. Not intended to diff --git a/python/runtime/cudaq/algorithms/py_translate.h b/python/runtime/cudaq/algorithms/py_translate.h index 67b43598744..041167f7017 100644 --- a/python/runtime/cudaq/algorithms/py_translate.h +++ b/python/runtime/cudaq/algorithms/py_translate.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindPyTranslate(py::module &mod); +void bindPyTranslate(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_unitary.cpp b/python/runtime/cudaq/algorithms/py_unitary.cpp index fad6bd1d0c7..3aefbbc957d 100644 --- a/python/runtime/cudaq/algorithms/py_unitary.cpp +++ b/python/runtime/cudaq/algorithms/py_unitary.cpp @@ -10,26 +10,25 @@ #include "cudaq/algorithms/unitary.h" #include "runtime/cudaq/operators/py_helpers.h" #include "runtime/cudaq/platform/py_alt_launch_kernel.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" - -namespace py = pybind11; +#include "utils/NanobindAdaptors.h" using namespace cudaq; /// Compute the unitary of this kernel module. 
-static py::array get_unitary_impl(const std::string &shortName, - MlirModule module, py::args args) { +static nanobind::object get_unitary_impl(const std::string &shortName, + MlirModule module, + nanobind::args args) { auto f = [=]() { return cudaq::marshal_and_launch_module(shortName, module, args); }; // Return as numpy array (dim, dim), complex128 auto temp = contrib::get_unitary_cmat(std::move(f)); - return details::cmat_to_numpy(temp); + return nanobind::cast(details::cmat_to_numpy(temp)); } /// Bind the get_unitary cudaq function -void cudaq::bindPyUnitary(py::module &mod) { +void cudaq::bindPyUnitary(nanobind::module_ &mod) { mod.def("get_unitary_impl", get_unitary_impl, "See python documentation for get_unitary()."); } diff --git a/python/runtime/cudaq/algorithms/py_unitary.h b/python/runtime/cudaq/algorithms/py_unitary.h index ea2ffeca055..fccac11e42b 100644 --- a/python/runtime/cudaq/algorithms/py_unitary.h +++ b/python/runtime/cudaq/algorithms/py_unitary.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindPyUnitary(py::module &mod); +void bindPyUnitary(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_utils.cpp b/python/runtime/cudaq/algorithms/py_utils.cpp index 0c6b16ec7c7..e396f93c3a5 100644 --- a/python/runtime/cudaq/algorithms/py_utils.cpp +++ b/python/runtime/cudaq/algorithms/py_utils.cpp @@ -8,52 +8,56 @@ #include "py_utils.h" #include "cudaq/utils/cudaq_utils.h" -#include -#include +#include +#include +#include +#include +#include namespace cudaq { -py::dict get_serializable_var_dict() { - py::object json = py::module_::import("json"); - py::dict serialized_dict; +nanobind::dict get_serializable_var_dict() { + nanobind::object json = nanobind::module_::import_("json"); + nanobind::dict serialized_dict; auto try_to_add_item = [&](const auto item) { try { auto key = item.first; auto value = item.second; - if (key.template cast().starts_with("__")) 
{ + if (nanobind::cast(key).starts_with("__")) { // Ignore items that start with "__" (like Python __builtins__, etc.) - } else if (py::hasattr(value, "to_json")) { - auto type = value.get_type(); + } else if (nanobind::hasattr(value, "to_json")) { + auto type = value.type(); std::string module = - type.attr("__module__").template cast(); - std::string name = type.attr("__name__").template cast(); - auto type_name = py::str(module + "." + name); - auto json_key_name = py::str(key) + py::str("/") + type_name; + nanobind::cast(type.attr("__module__")); + std::string name = nanobind::cast(type.attr("__name__")); + auto type_name = nanobind::str((module + "." + name).c_str()); + auto json_key_name = nanobind::str(nanobind::str(key).c_str()) + + nanobind::str("/") + type_name; serialized_dict[json_key_name] = json.attr("loads")(value.attr("to_json")()); - } else if (py::hasattr(value, "tolist")) { + } else if (nanobind::hasattr(value, "tolist")) { serialized_dict[key] = json.attr("loads")(json.attr("dumps")(value.attr("tolist")())); } else { serialized_dict[key] = json.attr("loads")(json.attr("dumps")(value)); } - } catch (const py::error_already_set &e) { + } catch (const nanobind::python_error &e) { // Uncomment the following lines for debug, but all this really means is // that we won't send this to the remote server. 
// std::cout << "Failed to serialize key '" - // << item.first.template cast() + // << nanobind::cast(item.first) // << "' : " + std::string(e.what()) << std::endl; } }; - for (const auto item : py::globals()) + for (const auto item : nanobind::globals()) try_to_add_item(item); - py::object inspect = py::module::import("inspect"); - std::vector frame_vec; + nanobind::object inspect = nanobind::module_::import_("inspect"); + std::vector frame_vec; auto current_frame = inspect.attr("currentframe")(); while (current_frame && !current_frame.is_none()) { frame_vec.push_back(current_frame); @@ -64,7 +68,8 @@ py::dict get_serializable_var_dict() { // globals first to locals last. This ensures that the overwrites give // precedence to closest-to-locals. for (auto it = frame_vec.rbegin(); it != frame_vec.rend(); ++it) { - py::dict f_locals = it->attr("f_locals"); + nanobind::dict f_locals = + nanobind::cast(it->attr("f_locals")); for (const auto item : f_locals) try_to_add_item(item); } @@ -104,56 +109,60 @@ static std::size_t strip_leading_whitespace(std::string &source_code) { return min_indent; } -std::string get_source_code(const py::function &func) { +std::string get_source_code(const nanobind::callable &func) { // Get the source code - py::module_ analysis = py::module_::import("cudaq.kernel.analysis"); - py::object FetchDepFuncsSourceCode = analysis.attr("FetchDepFuncsSourceCode"); - py::object source_code; + nanobind::module_ analysis = + nanobind::module_::import_("cudaq.kernel.analysis"); + nanobind::object FetchDepFuncsSourceCode = + analysis.attr("FetchDepFuncsSourceCode"); + nanobind::object source_code; try { source_code = FetchDepFuncsSourceCode.attr("fetch")(func); - } catch (py::error_already_set &e) { + } catch (nanobind::python_error &e) { throw std::runtime_error("Failed to get source code: " + std::string(e.what())); } - std::string source = source_code.cast(); + std::string source = nanobind::cast(source_code); strip_leading_whitespace(source); return 
source; } -std::string get_var_name_for_handle(const py::handle &h) { - py::object inspect = py::module::import("inspect"); +std::string get_var_name_for_handle(const nanobind::handle &h) { + nanobind::object inspect = nanobind::module_::import_("inspect"); // Search locals first, walking up the call stack auto current_frame = inspect.attr("currentframe")(); while (current_frame && !current_frame.is_none()) { - py::dict f_locals = current_frame.attr("f_locals"); + nanobind::dict f_locals = + nanobind::cast(current_frame.attr("f_locals")); for (auto item : f_locals) if (item.second.is(h)) - return py::str(item.first); + return nanobind::cast(nanobind::str(item.first)); current_frame = current_frame.attr("f_back"); } // Search globals now current_frame = inspect.attr("currentframe")(); - py::dict f_globals = current_frame.attr("f_globals"); + nanobind::dict f_globals = + nanobind::cast(current_frame.attr("f_globals")); for (auto item : f_globals) if (item.second.is(h)) - return py::str(item.first); + return nanobind::cast(nanobind::str(item.first)); return std::string(); } -std::unordered_map> +std::unordered_map> DataClassRegistry::classes{}; /// @brief Bind the dataclass registry -void bindPyDataClassRegistry(py::module &mod) { - py::class_(mod, "DataClassRegistry", - R"#(Registry for dataclasses used in kernels)#") +void bindPyDataClassRegistry(nanobind::module_ &mod) { + nanobind::class_( + mod, "DataClassRegistry", R"#(Registry for dataclasses used in kernels)#") .def_static("registerClass", &DataClassRegistry::registerClass, "Register class\n") .def_static("isRegisteredClass", &DataClassRegistry::isRegisteredClass, "Is class registered\n") .def_static("getClassAttributes", &DataClassRegistry::getClassAttributes, "Find registered class and its attributes\n") - .def_readonly_static("classes", &DataClassRegistry::classes); + .def_ro_static("classes", &DataClassRegistry::classes); } } // namespace cudaq diff --git a/python/runtime/cudaq/algorithms/py_utils.h 
b/python/runtime/cudaq/algorithms/py_utils.h index 84dc1e6455c..2abd81d122a 100644 --- a/python/runtime/cudaq/algorithms/py_utils.h +++ b/python/runtime/cudaq/algorithms/py_utils.h @@ -8,35 +8,36 @@ #pragma once -#include +#include +#include #include #include -namespace py = pybind11; - namespace cudaq { /// @brief Get a JSON-encoded dictionary of a combination of all local /// and global variables that are JSON compatible -py::dict get_serializable_var_dict(); +nanobind::dict get_serializable_var_dict(); -/// @brief Fetch the Python source code from a `py::function` -std::string get_source_code(const py::function &func); +/// @brief Fetch the Python source code from a `nanobind::callable` +std::string get_source_code(const nanobind::callable &func); /// @brief Find the variable name for a given Python object handle. It searches /// locally first, walks up the call stack, and finally checks the global /// namespace. If not found, it returns an empty string. -std::string get_var_name_for_handle(const py::handle &h); +std::string get_var_name_for_handle(const nanobind::handle &h); /// @brief Registry for python data classes used in kernels class DataClassRegistry { public: - static std::unordered_map> + static std::unordered_map> classes; /// @brief Register class object - static void registerClass(std::string &name, py::object cls) { - classes[name] = {cls, cls.attr("__annotations__").cast()}; + static void registerClass(std::string &name, nanobind::object cls) { + classes[name] = { + cls, nanobind::cast(cls.attr("__annotations__"))}; } /// @brief Is data class name registered @@ -45,12 +46,12 @@ class DataClassRegistry { } /// @brief Find registered data class object and its attributes - static std::tuple + static std::tuple getClassAttributes(std::string &name) { return classes[name]; } }; -void bindPyDataClassRegistry(py::module &mod); +void bindPyDataClassRegistry(nanobind::module_ &mod); } // namespace cudaq diff --git 
a/python/runtime/cudaq/domains/plugins/CMakeLists.txt b/python/runtime/cudaq/domains/plugins/CMakeLists.txt index 675919e25ca..3bd2e991655 100644 --- a/python/runtime/cudaq/domains/plugins/CMakeLists.txt +++ b/python/runtime/cudaq/domains/plugins/CMakeLists.txt @@ -15,10 +15,12 @@ else() endif() add_library(cudaq-pyscf SHARED PySCFDriver.cpp) +target_compile_options(cudaq-pyscf PRIVATE -Wno-cast-qual) + if (SKBUILD) target_link_libraries(cudaq-pyscf PRIVATE - pybind11::pybind11 Python::Module + nanobind-static Python::Module cudaq-chemistry cudaq-operator cudaq cudaq-py-utils cudaq-platform-default) # Apple's linker (ld64) doesn't support --unresolved-symbols flag if (NOT APPLE) @@ -31,7 +33,7 @@ else() endif() target_link_libraries(cudaq-pyscf PRIVATE - Python::Python pybind11::pybind11 + nanobind-static Python::Python cudaq-chemistry cudaq-operator cudaq cudaq-py-utils cudaq-platform-default) endif() diff --git a/python/runtime/cudaq/domains/plugins/PySCFDriver.cpp b/python/runtime/cudaq/domains/plugins/PySCFDriver.cpp index 1cd6e142a83..8f99b59e231 100644 --- a/python/runtime/cudaq/domains/plugins/PySCFDriver.cpp +++ b/python/runtime/cudaq/domains/plugins/PySCFDriver.cpp @@ -9,20 +9,18 @@ #include "cudaq/domains/chemistry/MoleculePackageDriver.h" #include "cudaq/target_control.h" #include -#include +#include +#include +#include -namespace py = pybind11; using namespace cudaq; namespace { -/// @brief Reference to the pybind11 scoped interpreter -thread_local static std::unique_ptr interp; - -/// @brief Map an OpenFermion QubitOperator represented as a py::object +/// @brief Map an OpenFermion QubitOperator represented as a nanobind::object /// to a CUDA-Q spin_op -spin_op fromOpenFermionQubitOperator(const py::object &op) { - if (!py::hasattr(op, "terms")) +spin_op fromOpenFermionQubitOperator(const nanobind::object &op) { + if (!nanobind::hasattr(op, "terms")) throw std::runtime_error( "This is not an openfermion operator, must have 'terms' attribute."); 
std::map> creatorMap{ @@ -32,20 +30,21 @@ spin_op fromOpenFermionQubitOperator(const py::object &op) { auto terms = op.attr("terms"); auto H = spin_op::empty(); for (auto term : terms) { - auto termTuple = term.cast(); + auto termTuple = nanobind::cast(term); auto localTerm = spin_op::identity(); - for (auto &element : termTuple) { - auto casted = element.cast>(); + for (auto element : termTuple) { + auto casted = + nanobind::cast>(element); localTerm *= creatorMap[casted.second](casted.first); } - H += terms[term].cast() * localTerm; + H += nanobind::cast(terms[term]) * localTerm; } return H; } /// @brief Implement the CUDA-Q MoleculePackageDriver interface /// with support for generating molecular Hamiltonians via PySCF. We -/// achieve this via Pybind11's embedded interpreter capabilities. +/// achieve this via nanobind's Python API wrappers. class PySCFPackageDriver : public MoleculePackageDriver { protected: /// @brief The name of the chemistry python module. @@ -62,82 +61,83 @@ class PySCFPackageDriver : public MoleculePackageDriver { int multiplicity, int charge, std::optional nActiveElectrons = std::nullopt, std::optional nActiveOrbitals = std::nullopt) override { - if (!interp) - interp = std::make_unique(); + if (!Py_IsInitialized()) + Py_Initialize(); // Convert the molecular_geometry to a list[tuple(str,tuple)] - py::list pyGeometry(geometry.size()); - for (std::size_t counter = 0; auto &atom : geometry) { - py::tuple coordinate(3); + nanobind::list pyGeometry; + for (auto &atom : geometry) { + nanobind::object coordinate = nanobind::steal(PyTuple_New(3)); for (int i = 0; i < 3; i++) - coordinate[i] = atom.coordinates[i]; + PyTuple_SET_ITEM(coordinate.ptr(), i, + nanobind::cast(atom.coordinates[i]).release().ptr()); - pyGeometry[counter++] = py::make_tuple(atom.name, coordinate); + pyGeometry.append(nanobind::make_tuple(atom.name, coordinate)); } // We don't want to modify the platform, indicate so cudaq::__internal__::disableTargetModification(); // 
Import the cudaq python chemistry module - auto cudaqModule = py::module_::import(ChemistryModuleName); + auto cudaqModule = nanobind::module_::import_(ChemistryModuleName); // Reset it cudaq::__internal__::enableTargetModification(); // Setup the active space if requested. - py::object nElectrons = py::none(); - py::object nActive = py::none(); + nanobind::object nElectrons = nanobind::none(); + nanobind::object nActive = nanobind::none(); if (nActiveElectrons.has_value()) - nElectrons = py::int_(nActiveElectrons.value()); + nElectrons = nanobind::int_(nActiveElectrons.value()); if (nActiveOrbitals.has_value()) - nActive = py::int_(nActiveOrbitals.value()); + nActive = nanobind::int_(nActiveOrbitals.value()); // Run the openfermion-pyscf wrapper to create the hamiltonian + metadata auto hamiltonianGen = cudaqModule.attr(CreatorFunctionName); - auto resultTuple = hamiltonianGen(pyGeometry, basis, multiplicity, charge, - nElectrons, nActive) - .cast(); + auto resultTuple = nanobind::cast(hamiltonianGen( + pyGeometry, basis, multiplicity, charge, nElectrons, nActive)); // Get the spin_op representation - auto spinOp = fromOpenFermionQubitOperator(resultTuple[0]); + auto spinOp = + fromOpenFermionQubitOperator(nanobind::borrow(resultTuple[0])); // Get the OpenFermion molecule representation - auto openFermionMolecule = resultTuple[1]; + auto openFermionMolecule = nanobind::borrow(resultTuple[1]); // Extract the one-body integrals auto pyOneBody = openFermionMolecule.attr("one_body_integrals"); - auto shape = pyOneBody.attr("shape").cast(); - one_body_integrals oneBody( - {shape[0].cast(), shape[1].cast()}); + auto shape = nanobind::cast(pyOneBody.attr("shape")); + one_body_integrals oneBody({nanobind::cast(shape[0]), + nanobind::cast(shape[1])}); for (std::size_t i = 0; i < oneBody.shape[0]; i++) for (std::size_t j = 0; j < oneBody.shape[1]; j++) - oneBody(i, j) = - pyOneBody.attr("__getitem__")(py::make_tuple(i, j)).cast(); + oneBody(i, j) = nanobind::cast( + 
pyOneBody.attr("__getitem__")(nanobind::make_tuple(i, j))); // Extract the two-body integrals auto pyTwoBody = openFermionMolecule.attr("two_body_integrals"); - shape = pyTwoBody.attr("shape").cast(); - two_body_integals twoBody( - {shape[0].cast(), shape[1].cast(), - shape[2].cast(), shape[3].cast()}); + shape = nanobind::cast(pyTwoBody.attr("shape")); + two_body_integals twoBody({nanobind::cast(shape[0]), + nanobind::cast(shape[1]), + nanobind::cast(shape[2]), + nanobind::cast(shape[3])}); for (std::size_t i = 0; i < twoBody.shape[0]; i++) for (std::size_t j = 0; j < twoBody.shape[1]; j++) for (std::size_t k = 0; k < twoBody.shape[2]; k++) for (std::size_t l = 0; l < twoBody.shape[3]; l++) - twoBody(i, j, k, l) = - pyTwoBody.attr("__getitem__")(py::make_tuple(i, j, k, l)) - .cast(); + twoBody(i, j, k, l) = nanobind::cast(pyTwoBody.attr( + "__getitem__")(nanobind::make_tuple(i, j, k, l))); // return a new molecular_hamiltonian return molecular_hamiltonian{ spinOp, std::move(oneBody), std::move(twoBody), - openFermionMolecule.attr("n_electrons").cast(), - openFermionMolecule.attr("n_orbitals").cast(), - openFermionMolecule.attr("nuclear_repulsion").cast(), - openFermionMolecule.attr("hf_energy").cast(), - openFermionMolecule.attr("fci_energy").cast()}; + nanobind::cast(openFermionMolecule.attr("n_electrons")), + nanobind::cast(openFermionMolecule.attr("n_orbitals")), + nanobind::cast(openFermionMolecule.attr("nuclear_repulsion")), + nanobind::cast(openFermionMolecule.attr("hf_energy")), + nanobind::cast(openFermionMolecule.attr("fci_energy"))}; } }; diff --git a/python/runtime/cudaq/dynamics/CMakeLists.txt b/python/runtime/cudaq/dynamics/CMakeLists.txt index c56a4c3672d..d7910fdf586 100644 --- a/python/runtime/cudaq/dynamics/CMakeLists.txt +++ b/python/runtime/cudaq/dynamics/CMakeLists.txt @@ -7,13 +7,9 @@ # ============================================================================ # find_package(Python COMPONENTS Interpreter Development) -find_package(pybind11 
CONFIG) -pybind11_add_module(nvqir_dynamics_bindings pyDynamics.cpp) -target_include_directories(nvqir_dynamics_bindings PRIVATE - ${PYTHON_INCLUDE_DIRS} - ${pybind11_INCLUDE_DIRS} -) +nanobind_add_module(nvqir_dynamics_bindings NB_STATIC pyDynamics.cpp) + target_include_directories(nvqir_dynamics_bindings PRIVATE ${CMAKE_SOURCE_DIR}/runtime diff --git a/python/runtime/cudaq/dynamics/pyDynamics.cpp b/python/runtime/cudaq/dynamics/pyDynamics.cpp index 5a4ee1380ac..1fdccbedcaa 100644 --- a/python/runtime/cudaq/dynamics/pyDynamics.cpp +++ b/python/runtime/cudaq/dynamics/pyDynamics.cpp @@ -15,10 +15,14 @@ #include "cudaq/algorithms/base_integrator.h" #include "cudaq/algorithms/integrator.h" #include "cudaq/schedule.h" -#include -#include +#include +#include +#include +#include +#include +#include +#include -namespace py = pybind11; namespace { cudaq::CuDensityMatState *asCudmState(cudaq::state &cudaqState) { auto *simState = cudaq::state_helper::getSimulationState(&cudaqState); @@ -30,7 +34,7 @@ cudaq::CuDensityMatState *asCudmState(cudaq::state &cudaqState) { } // namespace // Internal dynamics bindings -PYBIND11_MODULE(nvqir_dynamics_bindings, m) { +NB_MODULE(nvqir_dynamics_bindings, m) { class PyCuDensityMatTimeStepper : public cudaq::CuDensityMatTimeStepper { public: PyCuDensityMatTimeStepper(cudensitymatHandle_t handle, @@ -42,76 +46,80 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { }; // Time stepper bindings - py::class_(m, "TimeStepper") - .def(py::init( - [](cudaq::schedule schedule, std::vector modeExtents, - cudaq::sum_op hamiltonian, - std::vector> collapse_ops, - bool is_master_equation) { - std::unordered_map> params; - for (const auto ¶m : schedule.get_parameters()) { - params[param] = schedule.get_value_function()(param, 0.0); - } - auto liouvillian = cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .constructLiouvillian( - {hamiltonian}, {collapse_ops}, - modeExtents, params, is_master_equation); - return 
PyCuDensityMatTimeStepper( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - liouvillian, schedule); - })) - .def(py::init([](cudaq::schedule schedule, - std::vector modeExtents, - cudaq::super_op superOp) { - std::unordered_map> params; - for (const auto ¶m : schedule.get_parameters()) { - params[param] = schedule.get_value_function()(param, 0.0); - } - auto liouvillian = - cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .constructLiouvillian({superOp}, modeExtents, params); - return PyCuDensityMatTimeStepper( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - liouvillian, schedule); - })) - .def(py::init([](cudaq::schedule schedule, - std::vector modeExtents, - const std::vector> - &hamiltonians, - const std::vector< - std::vector>> - &list_collapse_ops, - bool is_master_equation) { - std::unordered_map> params; - for (const auto ¶m : schedule.get_parameters()) { - params[param] = schedule.get_value_function()(param, 0.0); - } - auto liouvillian = - cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .constructLiouvillian(hamiltonians, list_collapse_ops, - modeExtents, params, is_master_equation); - return PyCuDensityMatTimeStepper( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - liouvillian, schedule); - })) - .def(py::init([](cudaq::schedule schedule, - std::vector modeExtents, - const std::vector &superOps) { - std::unordered_map> params; - for (const auto ¶m : schedule.get_parameters()) { - params[param] = schedule.get_value_function()(param, 0.0); - } - auto liouvillian = - cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .constructLiouvillian(superOps, modeExtents, params); - return PyCuDensityMatTimeStepper( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - liouvillian, schedule); - })) + nanobind::class_(m, "TimeStepper") + .def("__init__", + [](PyCuDensityMatTimeStepper *self, cudaq::schedule schedule, + std::vector modeExtents, + 
cudaq::sum_op hamiltonian, + std::vector> collapse_ops, + bool is_master_equation) { + std::unordered_map> params; + for (const auto ¶m : schedule.get_parameters()) { + params[param] = schedule.get_value_function()(param, 0.0); + } + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian({hamiltonian}, {collapse_ops}, + modeExtents, params, + is_master_equation); + new (self) PyCuDensityMatTimeStepper( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + liouvillian, schedule); + }) + .def("__init__", + [](PyCuDensityMatTimeStepper *self, cudaq::schedule schedule, + std::vector modeExtents, cudaq::super_op superOp) { + std::unordered_map> params; + for (const auto ¶m : schedule.get_parameters()) { + params[param] = schedule.get_value_function()(param, 0.0); + } + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian({superOp}, modeExtents, params); + new (self) PyCuDensityMatTimeStepper( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + liouvillian, schedule); + }) + .def("__init__", + [](PyCuDensityMatTimeStepper *self, cudaq::schedule schedule, + std::vector modeExtents, + const std::vector> + &hamiltonians, + const std::vector>> &list_collapse_ops, + bool is_master_equation) { + std::unordered_map> params; + for (const auto ¶m : schedule.get_parameters()) { + params[param] = schedule.get_value_function()(param, 0.0); + } + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian(hamiltonians, list_collapse_ops, + modeExtents, params, + is_master_equation); + new (self) PyCuDensityMatTimeStepper( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + liouvillian, schedule); + }) + .def("__init__", + [](PyCuDensityMatTimeStepper *self, cudaq::schedule schedule, + std::vector modeExtents, + const std::vector &superOps) { + std::unordered_map> params; + for (const 
auto ¶m : schedule.get_parameters()) { + params[param] = schedule.get_value_function()(param, 0.0); + } + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian(superOps, modeExtents, params); + new (self) PyCuDensityMatTimeStepper( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + liouvillian, schedule); + }) .def("compute", [](PyCuDensityMatTimeStepper &self, cudaq::state &inputState, double t) { @@ -124,7 +132,6 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { .def("compute", [](PyCuDensityMatTimeStepper &self, cudaq::state &inputState, double t, cudaq::state &outputState) { - // Compute into the provided output state std::unordered_map> params; for (const auto ¶m : self.m_schedule.get_parameters()) { params[param] = self.m_schedule.get_value_function()(param, t); @@ -151,24 +158,26 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { }); // System dynamics data class - py::class_(m, "SystemDynamics") - .def(py::init<>()) - .def_readwrite("modeExtents", &cudaq::SystemDynamics::modeExtents) - .def_readwrite("hamiltonian", &cudaq::SystemDynamics::hamiltonian) - .def_readwrite("collapseOps", &cudaq::SystemDynamics::collapseOps) - .def_readwrite("parameters", &cudaq::SystemDynamics::parameters) - .def_readwrite("superOp", &cudaq::SystemDynamics::superOp); + nanobind::class_(m, "SystemDynamics") + .def(nanobind::init<>()) + .def_rw("modeExtents", &cudaq::SystemDynamics::modeExtents) + .def_rw("hamiltonian", &cudaq::SystemDynamics::hamiltonian) + .def_rw("collapseOps", &cudaq::SystemDynamics::collapseOps) + .def_rw("parameters", &cudaq::SystemDynamics::parameters) + .def_rw("superOp", &cudaq::SystemDynamics::superOp); // Expectation calculation - py::class_(m, "CuDensityMatExpectation") - .def(py::init([](cudaq::sum_op &obs, - const std::vector &modeExtents) { - return cudaq::CuDensityMatExpectation( - cudaq::dynamics::Context::getCurrentContext()->getHandle(), - 
cudaq::dynamics::Context::getCurrentContext() - ->getOpConverter() - .convertToCudensitymatOperator({}, obs, modeExtents)); - })) + nanobind::class_(m, "CuDensityMatExpectation") + .def("__init__", + [](cudaq::CuDensityMatExpectation *self, + cudaq::sum_op &obs, + const std::vector &modeExtents) { + new (self) cudaq::CuDensityMatExpectation( + cudaq::dynamics::Context::getCurrentContext()->getHandle(), + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .convertToCudensitymatOperator({}, obs, modeExtents)); + }) .def("prepare", [](cudaq::CuDensityMatExpectation &self, cudaq::state &state) { auto *cudmState = asCudmState(state); @@ -187,9 +196,9 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { }); // Schedule class - py::class_(m, "Schedule") - .def(py::init &, - const std::vector &>()); + nanobind::class_(m, "Schedule") + .def(nanobind::init &, + const std::vector &>()); // Helper to initialize a data buffer state m.def("initializeState", @@ -287,23 +296,24 @@ PYBIND11_MODULE(nvqir_dynamics_bindings, m) { return cudaq::__internal__::checkBatchingCompatibility(hamOps, listCollapseOps); }, - py::arg("hamiltonians"), py::arg("collapse_operators")); + nanobind::arg("hamiltonians"), nanobind::arg("collapse_operators")); m.def( "checkSuperOpBatchingCompatibility", [](const std::vector &super_operators) { return cudaq::__internal__::checkBatchingCompatibility(super_operators); }, - py::arg("super_operators")); + nanobind::arg("super_operators")); auto integratorsSubmodule = m.def_submodule("integrators"); // Runge-Kutta integrator - py::class_(integratorsSubmodule, - "runge_kutta") - .def(py::init>(), py::kw_only(), - py::arg("order") = cudaq::integrators::runge_kutta::default_order, - py::arg("max_step_size") = py::none()) + nanobind::class_(integratorsSubmodule, + "runge_kutta") + .def(nanobind::init>(), nanobind::kw_only(), + nanobind::arg("order") = + cudaq::integrators::runge_kutta::default_order, + nanobind::arg("max_step_size") = 
nanobind::none()) .def("setState", [](cudaq::integrators::runge_kutta &self, cudaq::state &state, double t) { self.setState(state, t); }) diff --git a/python/runtime/cudaq/operators/py_boson_op.cpp b/python/runtime/cudaq/operators/py_boson_op.cpp index 5a23d29a9af..6df75bd5a27 100644 --- a/python/runtime/cudaq/operators/py_boson_op.cpp +++ b/python/runtime/cudaq/operators/py_boson_op.cpp @@ -7,10 +7,17 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -19,7 +26,7 @@ namespace cudaq { -void bindBosonModule(py::module &mod) { +void bindBosonModule(nanobind::module_ &mod) { // Binding the functions in `cudaq::boson` as `_pycudaq` submodule // so it's accessible directly in the cudaq namespace. auto boson_submodule = mod.def_submodule("boson"); @@ -32,31 +39,32 @@ void bindBosonModule(py::module &mod) { "Returns product operator with constant value 1."); boson_submodule.def( "identity", [](std::size_t target) { return boson_op::identity(target); }, - py::arg("target"), + nanobind::arg("target"), "Returns an identity operator on the given target index."); boson_submodule.def( "identities", [](std::size_t first, std::size_t last) { return boson_op_term(first, last); }, - py::arg("first"), py::arg("last"), + nanobind::arg("first"), nanobind::arg("last"), "Creates a product operator that applies an identity operation to all " "degrees of " "freedom in the open range [first, last)."); boson_submodule.def( - "create", &boson_op::create, py::arg("target"), + "create", &boson_op::create, nanobind::arg("target"), "Returns a bosonic creation operator on the given target index."); boson_submodule.def( - "annihilate", &boson_op::annihilate, py::arg("target"), + "annihilate", &boson_op::annihilate, + 
nanobind::arg("target"), "Returns a bosonic annihilation operator on the given target index."); boson_submodule.def( - "number", &boson_op::number, py::arg("target"), + "number", &boson_op::number, nanobind::arg("target"), "Returns a bosonic number operator on the given target index."); boson_submodule.def( - "position", &boson_op::position, py::arg("target"), + "position", &boson_op::position, nanobind::arg("target"), "Returns a bosonic position operator on the given target index."); boson_submodule.def( - "momentum", &boson_op::momentum, py::arg("target"), + "momentum", &boson_op::momentum, nanobind::arg("target"), "Returns a bosonic momentum operator on the given target index."); boson_submodule.def( "canonicalized", @@ -90,50 +98,52 @@ void bindBosonModule(py::module &mod) { "degrees of freedom."); } -void bindBosonOperator(py::module &mod) { +void bindBosonOperator(nanobind::module_ &mod) { - auto boson_op_class = py::class_(mod, "BosonOperator"); + auto boson_op_class = nanobind::class_(mod, "BosonOperator"); auto boson_op_term_class = - py::class_(mod, "BosonOperatorTerm"); + nanobind::class_(mod, "BosonOperatorTerm"); boson_op_class .def( "__iter__", [](boson_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &boson_op::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &boson_op::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. 
" - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &boson_op::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &boson_op::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("term_count", &boson_op::num_terms, - "Returns the number of terms in the operator.") + .def_prop_ro("parameters", &boson_op::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &boson_op::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &boson_op::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &boson_op::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("term_count", &boson_op::num_terms, + "Returns the number of terms in the operator.") // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a default instantiated sum. 
A default instantiated " "sum has no value; it will take a value the first time an " "arithmetic operation " @@ -142,12 +152,12 @@ void bindBosonOperator(py::module &mod) { "identity. To construct a `0` value in the mathematical sense " "(neutral element " "for addition), use `empty()` instead.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with no terms, reserving " "space for the given number of terms.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with the given term.") - .def(py::init(), "Copy constructor.") + .def(nanobind::init(), "Copy constructor.") .def( "copy", [](const boson_op &self) { return boson_op(self); }, "Creates a copy of the operator.") @@ -161,9 +171,9 @@ void bindBosonOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -173,13 +183,13 @@ void bindBosonOperator(py::module &mod) { .def( "to_matrix", [](const boson_op &self, dimension_map &dimensions, bool invert_order, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." 
"The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -192,9 +202,9 @@ void bindBosonOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -208,12 +218,12 @@ void bindBosonOperator(py::module &mod) { .def( "to_sparse_matrix", [](const boson_op &self, dimension_map &dimensions, bool invert_order, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -227,7 +237,7 @@ void bindBosonOperator(py::module &mod) { // comparisons - .def("__eq__", &boson_op::operator==, py::is_operator(), + .def("__eq__", &boson_op::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. 
Operators acting on different " @@ -239,91 +249,91 @@ void bindBosonOperator(py::module &mod) { [](const boson_op &self, const boson_op_term &other) { return self.num_terms() == 1 && *self.begin() == other; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self += int(), py::is_operator()) - .def(py::self -= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self += double(), py::is_operator()) - .def(py::self -= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self += std::complex(), py::is_operator()) - .def(py::self -= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self += scalar_operator(), py::is_operator()) - .def(py::self -= scalar_operator(), py::is_operator()) - .def(py::self *= boson_op_term(), py::is_operator()) - .def(py::self += boson_op_term(), py::is_operator()) - .def(py::self -= boson_op_term(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) - .def(py::self += py::self, py::is_operator()) -// see issue https://github.com/pybind/pybind11/issues/1893 + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self += int(), nanobind::is_operator()) + .def(nanobind::self -= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), 
nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self += double(), nanobind::is_operator()) + .def(nanobind::self -= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self += std::complex(), nanobind::is_operator()) + .def(nanobind::self -= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self += scalar_operator(), nanobind::is_operator()) + .def(nanobind::self -= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= boson_op_term(), nanobind::is_operator()) + .def(nanobind::self += boson_op_term(), nanobind::is_operator()) + .def(nanobind::self -= boson_op_term(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) + .def(nanobind::self += nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" #endif - .def(py::self -= py::self, py::is_operator()) + .def(nanobind::self -= nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic pop #endif // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), 
py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * boson_op_term(), py::is_operator()) - .def(py::self + boson_op_term(), py::is_operator()) - .def(py::self - boson_op_term(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * boson_op_term(), nanobind::is_operator()) + .def(nanobind::self + boson_op_term(), nanobind::is_operator()) + .def(nanobind::self - boson_op_term(), nanobind::is_operator()) + 
.def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // common operators @@ -351,17 +361,17 @@ void 
bindBosonOperator(py::module &mod) { .def("dump", &boson_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &boson_op::trim, py::arg("tol") = 0.0, - py::arg("parameters") = parameter_map(), + .def("trim", &boson_op::trim, nanobind::arg("tol") = 0.0, + nanobind::arg("parameters") = parameter_map(), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") .def( "trim", - [](boson_op &self, double tol, const py::kwargs &kwargs) { + [](boson_op &self, double tol, const nanobind::kwargs &kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - py::arg("tol") = 0.0, + nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -386,42 +396,44 @@ void bindBosonOperator(py::module &mod) { .def( "__iter__", [](boson_op_term &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &boson_op_term::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &boson_op_term::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. 
An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &boson_op_term::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &boson_op_term::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("ops_count", &boson_op_term::num_ops, - "Returns the number of operators in the product.") - .def_property_readonly( + .def_prop_ro("parameters", &boson_op_term::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &boson_op_term::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &boson_op_term::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &boson_op_term::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("ops_count", &boson_op_term::num_ops, + "Returns the number of operators in the product.") + .def_prop_ro( "term_id", &boson_op_term::get_term_id, "The term id uniquely identifies the operators and targets (degrees) " "that they act on, " "but does not include information about the coefficient.") - .def_property_readonly( + .def_prop_ro( "coefficient", &boson_op_term::get_coefficient, "Returns the unevaluated coefficient of the operator. The " "coefficient is a " @@ -429,30 +441,32 @@ void bindBosonOperator(py::module &mod) { // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a product operator with constant value 1. The returned " "operator does not target any degrees of freedom but merely " "represents a constant.") - .def(py::init(), py::arg("first_degree"), - py::arg("last_degree"), + .def(nanobind::init(), + nanobind::arg("first_degree"), nanobind::arg("last_degree"), "Creates a product operator that applies an identity operation to " "all degrees of " "freedom in the range [first_degree, last_degree).") - .def(py::init(), + .def(nanobind::init(), "Creates a product operator with the given constant value. " "The returned operator does not target any degrees of freedom.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a product operator with the given " "constant value. 
The returned operator does not target any degrees " "of freedom.") - .def(py::init([](const scalar_operator &scalar) { - return boson_op_term() * scalar; - }), - "Creates a product operator with non-constant scalar value.") - .def(py::init(), + .def( + "__init__", + [](boson_op_term *self, const scalar_operator &scalar) { + new (self) boson_op_term(boson_op_term() * scalar); + }, + "Creates a product operator with non-constant scalar value.") + .def(nanobind::init(), "Creates a product operator with the given elementary operator.") - .def(py::init(), py::arg("operator"), - py::arg("size") = 0, + .def(nanobind::init(), + nanobind::arg("operator"), nanobind::arg("size") = 0, "Creates a copy of the given operator and reserves space for " "storing the given " "number of product terms (if a size is provided).") @@ -463,7 +477,7 @@ void bindBosonOperator(py::module &mod) { // evaluations .def("evaluate_coefficient", &boson_op_term::evaluate_coefficient, - py::arg("parameters") = parameter_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the evaluated coefficient of the product operator. The " "parameters is a map of parameter names to their concrete, complex " "values.") @@ -474,9 +488,9 @@ void bindBosonOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -486,13 +500,13 @@ void bindBosonOperator(py::module &mod) { .def( "to_matrix", [](const boson_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -505,9 +519,9 @@ void bindBosonOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -521,12 +535,12 @@ void bindBosonOperator(py::module &mod) { .def( "to_sparse_matrix", [](const boson_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. 
This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -540,7 +554,7 @@ void bindBosonOperator(py::module &mod) { // comparisons - .def("__eq__", &boson_op_term::operator==, py::is_operator(), + .def("__eq__", &boson_op_term::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. Operators acting on different " @@ -552,77 +566,78 @@ void bindBosonOperator(py::module &mod) { [](const boson_op_term &self, const boson_op &other) { return other.num_terms() == 1 && *other.begin() == self; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= 
nanobind::self, nanobind::is_operator()) // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * boson_op(), py::is_operator()) - .def(py::self + boson_op(), py::is_operator()) - .def(py::self - boson_op(), py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + 
.def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * boson_op(), nanobind::is_operator()) + .def(nanobind::self + boson_op(), nanobind::is_operator()) + .def(nanobind::self - boson_op(), nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, 
nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // general utility functions .def("is_identity", &boson_op_term::is_identity, "Checks if all operators in the product are the identity. " - "Note: this function returns true regardless of the value of the " - "coefficient.") + "Note that this function returns true regardless of the value of " + "the coefficient.") .def( "__str__", [](const boson_op_term &self) { return self.to_string(); }, "Returns the string representation of the operator.") @@ -645,12 +660,12 @@ void bindBosonOperator(py::module &mod) { "of freedom that are not included in the given set."); } -void bindBosonWrapper(py::module &mod) { +void bindBosonWrapper(nanobind::module_ &mod) { bindBosonOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, boson_op_term>(); - py::implicitly_convertible(); - py::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, boson_op_term>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); bindBosonModule(mod); } diff --git a/python/runtime/cudaq/operators/py_boson_op.h b/python/runtime/cudaq/operators/py_boson_op.h index 7952fcab9cf..7f74e49cbc0 100644 --- a/python/runtime/cudaq/operators/py_boson_op.h +++ b/python/runtime/cudaq/operators/py_boson_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of bosonic /// operators to python. -void bindBosonWrapper(py::module &mod); +void bindBosonWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_fermion_op.cpp b/python/runtime/cudaq/operators/py_fermion_op.cpp index 7c0e315d6f5..621f39c873f 100644 --- a/python/runtime/cudaq/operators/py_fermion_op.cpp +++ b/python/runtime/cudaq/operators/py_fermion_op.cpp @@ -7,10 +7,18 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -19,7 +27,7 @@ namespace cudaq { -void bindFermionModule(py::module &mod) { +void bindFermionModule(nanobind::module_ &mod) { // Binding the functions in `cudaq::fermion` as `_pycudaq` submodule // so it's accessible directly in the cudaq namespace. 
auto fermion_submodule = mod.def_submodule("fermion"); @@ -33,25 +41,26 @@ void bindFermionModule(py::module &mod) { fermion_submodule.def( "identity", [](std::size_t target) { return fermion_op::identity(target); }, - py::arg("target"), + nanobind::arg("target"), "Returns an identity operator on the given target index."); fermion_submodule.def( "identities", [](std::size_t first, std::size_t last) { return fermion_op_term(first, last); }, - py::arg("first"), py::arg("last"), + nanobind::arg("first"), nanobind::arg("last"), "Creates a product operator that applies an identity operation to all " "degrees of " "freedom in the open range [first, last)."); fermion_submodule.def( - "create", &fermion_op::create, py::arg("target"), + "create", &fermion_op::create, nanobind::arg("target"), "Returns a fermionic creation operator on the given target index."); fermion_submodule.def( - "annihilate", &fermion_op::annihilate, py::arg("target"), + "annihilate", &fermion_op::annihilate, + nanobind::arg("target"), "Returns a fermionic annihilation operator on the given target index."); fermion_submodule.def( - "number", &fermion_op::number, py::arg("target"), + "number", &fermion_op::number, nanobind::arg("target"), "Returns a fermionic number operator on the given target index."); fermion_submodule.def( "canonicalized", @@ -85,50 +94,52 @@ void bindFermionModule(py::module &mod) { "degrees of freedom."); } -void bindFermionOperator(py::module &mod) { +void bindFermionOperator(nanobind::module_ &mod) { - auto fermion_op_class = py::class_(mod, "FermionOperator"); + auto fermion_op_class = nanobind::class_(mod, "FermionOperator"); auto fermion_op_term_class = - py::class_(mod, "FermionOperatorTerm"); + nanobind::class_(mod, "FermionOperatorTerm"); fermion_op_class .def( "__iter__", [](fermion_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), 
"Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &fermion_op::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &fermion_op::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &fermion_op::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &fermion_op::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("term_count", &fermion_op::num_terms, - "Returns the number of terms in the operator.") + .def_prop_ro("parameters", &fermion_op::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &fermion_op::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &fermion_op::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &fermion_op::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("term_count", &fermion_op::num_terms, + "Returns the number of terms in the operator.") // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a default instantiated sum. A default instantiated " "sum has no value; it will take a value the first time an " "arithmetic operation " @@ -137,12 +148,12 @@ void bindFermionOperator(py::module &mod) { "identity. To construct a `0` value in the mathematical sense " "(neutral element " "for addition), use `empty()` instead.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with no terms, reserving " "space for the given number of terms.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with the given term.") - .def(py::init(), "Copy constructor.") + .def(nanobind::init(), "Copy constructor.") .def( "copy", [](const fermion_op &self) { return fermion_op(self); }, "Creates a copy of the operator.") @@ -156,9 +167,9 @@ void bindFermionOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -168,13 +179,13 @@ void bindFermionOperator(py::module &mod) { .def( "to_matrix", [](const fermion_op &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -187,9 +198,9 @@ void bindFermionOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -203,12 +214,12 @@ void bindFermionOperator(py::module &mod) { .def( "to_sparse_matrix", [](const fermion_op &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. 
This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -222,7 +233,7 @@ void bindFermionOperator(py::module &mod) { // comparisons - .def("__eq__", &fermion_op::operator==, py::is_operator(), + .def("__eq__", &fermion_op::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. Operators acting on different " @@ -234,91 +245,91 @@ void bindFermionOperator(py::module &mod) { [](const fermion_op &self, const fermion_op_term &other) { return self.num_terms() == 1 && *self.begin() == other; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self += int(), py::is_operator()) - .def(py::self -= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self += double(), py::is_operator()) - .def(py::self -= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self += std::complex(), py::is_operator()) - .def(py::self -= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self += scalar_operator(), py::is_operator()) - .def(py::self -= scalar_operator(), py::is_operator()) - .def(py::self *= fermion_op_term(), py::is_operator()) - .def(py::self += fermion_op_term(), py::is_operator()) - .def(py::self -= fermion_op_term(), py::is_operator()) - 
.def(py::self *= py::self, py::is_operator()) - .def(py::self += py::self, py::is_operator()) -// see issue https://github.com/pybind/pybind11/issues/1893 + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self += int(), nanobind::is_operator()) + .def(nanobind::self -= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self += double(), nanobind::is_operator()) + .def(nanobind::self -= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self += std::complex(), nanobind::is_operator()) + .def(nanobind::self -= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self += scalar_operator(), nanobind::is_operator()) + .def(nanobind::self -= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self += fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self -= fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) + .def(nanobind::self += nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" #endif - .def(py::self -= py::self, py::is_operator()) + .def(nanobind::self -= nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic pop #endif // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - 
.def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * fermion_op_term(), py::is_operator()) - .def(py::self + fermion_op_term(), py::is_operator()) - .def(py::self - fermion_op_term(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / 
scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self + fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self - fermion_op_term(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, 
nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // common operators @@ -346,17 +357,17 @@ void bindFermionOperator(py::module &mod) { .def("dump", &fermion_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &fermion_op::trim, py::arg("tol") = 0.0, - py::arg("parameters") = parameter_map(), + .def("trim", &fermion_op::trim, nanobind::arg("tol") = 0.0, + nanobind::arg("parameters") = parameter_map(), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") .def( "trim", - [](fermion_op &self, double tol, const py::kwargs &kwargs) { + [](fermion_op &self, double tol, const nanobind::kwargs &kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - py::arg("tol") = 0.0, + nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -381,42 +392,44 @@ void bindFermionOperator(py::module &mod) { .def( "__iter__", [](fermion_op_term &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &fermion_op_term::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", 
&fermion_op_term::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &fermion_op_term::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &fermion_op_term::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("ops_count", &fermion_op_term::num_ops, - "Returns the number of operators in the product.") - .def_property_readonly( + .def_prop_ro("parameters", &fermion_op_term::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &fermion_op_term::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &fermion_op_term::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &fermion_op_term::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("ops_count", &fermion_op_term::num_ops, + "Returns the number of operators in the product.") + .def_prop_ro( "term_id", &fermion_op_term::get_term_id, "The term id uniquely identifies the operators and targets (degrees) " "that they act on, " "but does not include information about the coefficient.") - .def_property_readonly( + .def_prop_ro( "coefficient", &fermion_op_term::get_coefficient, "Returns the unevaluated coefficient of the operator. The " "coefficient is a " @@ -424,30 +437,32 @@ void bindFermionOperator(py::module &mod) { // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a product operator with constant value 1. The returned " "operator does not target any degrees of freedom but merely " "represents a constant.") - .def(py::init(), py::arg("first_degree"), - py::arg("last_degree"), + .def(nanobind::init(), + nanobind::arg("first_degree"), nanobind::arg("last_degree"), "Creates a product operator that applies an identity operation to " "all degrees of " "freedom in the range [first_degree, last_degree).") - .def(py::init(), + .def(nanobind::init(), "Creates a product operator with the given constant value. " "The returned operator does not target any degrees of freedom.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a product operator with the given " "constant value. 
The returned operator does not target any degrees " "of freedom.") - .def(py::init([](const scalar_operator &scalar) { - return fermion_op_term() * scalar; - }), - "Creates a product operator with non-constant scalar value.") - .def(py::init(), + .def( + "__init__", + [](fermion_op_term *self, const scalar_operator &scalar) { + new (self) fermion_op_term(fermion_op_term() * scalar); + }, + "Creates a product operator with non-constant scalar value.") + .def(nanobind::init(), "Creates a product operator with the given elementary operator.") - .def(py::init(), - py::arg("operator"), py::arg("size") = 0, + .def(nanobind::init(), + nanobind::arg("operator"), nanobind::arg("size") = 0, "Creates a copy of the given operator and reserves space for " "storing the given " "number of product terms (if a size is provided).") @@ -459,7 +474,7 @@ void bindFermionOperator(py::module &mod) { // evaluations .def("evaluate_coefficient", &fermion_op_term::evaluate_coefficient, - py::arg("parameters") = parameter_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the evaluated coefficient of the product operator. The " "parameters is a map of parameter names to their concrete, complex " "values.") @@ -470,9 +485,9 @@ void bindFermionOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -482,13 +497,13 @@ void bindFermionOperator(py::module &mod) { .def( "to_matrix", [](const fermion_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -501,9 +516,9 @@ void bindFermionOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -517,12 +532,12 @@ void bindFermionOperator(py::module &mod) { .def( "to_sparse_matrix", [](const fermion_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. 
This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -536,7 +551,7 @@ void bindFermionOperator(py::module &mod) { // comparisons - .def("__eq__", &fermion_op_term::operator==, py::is_operator(), + .def("__eq__", &fermion_op_term::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. Operators acting on different " @@ -548,77 +563,78 @@ void bindFermionOperator(py::module &mod) { [](const fermion_op_term &self, const fermion_op &other) { return other.num_terms() == 1 && *other.begin() == self; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= 
nanobind::self, nanobind::is_operator()) // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * fermion_op(), py::is_operator()) - .def(py::self + fermion_op(), py::is_operator()) - .def(py::self - fermion_op(), py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), 
nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * fermion_op(), nanobind::is_operator()) + .def(nanobind::self + fermion_op(), nanobind::is_operator()) + .def(nanobind::self - fermion_op(), nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) 
+ .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // general utility functions .def("is_identity", &fermion_op_term::is_identity, "Checks if all operators in the product are the identity. " - "Note: this function returns true regardless of the value of the " - "coefficient.") + "Note that this function returns true regardless of the value of " + "the coefficient.") .def( "__str__", [](const fermion_op_term &self) { return self.to_string(); }, @@ -642,12 +658,12 @@ void bindFermionOperator(py::module &mod) { "of freedom that are not included in the given set."); } -void bindFermionWrapper(py::module &mod) { +void bindFermionWrapper(nanobind::module_ &mod) { bindFermionOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, fermion_op_term>(); - py::implicitly_convertible(); - py::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, fermion_op_term>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); bindFermionModule(mod); } diff --git a/python/runtime/cudaq/operators/py_fermion_op.h b/python/runtime/cudaq/operators/py_fermion_op.h index b54e406267a..45dbb8015d2 100644 --- a/python/runtime/cudaq/operators/py_fermion_op.h +++ b/python/runtime/cudaq/operators/py_fermion_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of fermionic /// operators to python. -void bindFermionWrapper(py::module &mod); +void bindFermionWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_handlers.cpp b/python/runtime/cudaq/operators/py_handlers.cpp index 71ca25a15aa..e8c2147e92b 100644 --- a/python/runtime/cudaq/operators/py_handlers.cpp +++ b/python/runtime/cudaq/operators/py_handlers.cpp @@ -7,11 +7,14 @@ ******************************************************************************/ #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "py_handlers.h" @@ -19,53 +22,56 @@ namespace cudaq { -void bindPauli(py::module mod) { - py::enum_(mod, "Pauli", - "An enumeration representing the types of Pauli matrices.") +void bindPauli(nanobind::module_ mod) { + nanobind::enum_( + mod, "Pauli", "An enumeration representing the types of Pauli matrices.") .value("X", pauli::X) .value("Y", pauli::Y) .value("Z", pauli::Z) .value("I", pauli::I); } -void bindOperatorHandlers(py::module &mod) { +void bindOperatorHandlers(nanobind::module_ &mod) { using matrix_callback = std::function &, const parameter_map &)>; - py::class_(mod, "MatrixOperatorElement") - .def_property_readonly( + nanobind::class_(mod, "MatrixOperatorElement") + .def_prop_ro( "id", [](const matrix_handler &self) { return self.to_string(false); }, "Returns the id used to define and instantiate the operator.") - .def_property_readonly("degrees", &matrix_handler::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def_property_readonly("parameters", - &matrix_handler::get_parameter_descriptions, - "Returns a dictionary that maps each 
parameter " - "name to its description.") - .def_property_readonly("expected_dimensions", - &matrix_handler::get_expected_dimensions, - "The number of levels, that is the dimension, for " - "each degree of freedom " - "in canonical order that the operator acts on. A " - "value of zero or less " - "indicates that the operator is defined for any " - "dimension of that degree.") - .def(py::init(), + .def_prop_ro("degrees", &matrix_handler::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def_prop_ro("parameters", &matrix_handler::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("expected_dimensions", + &matrix_handler::get_expected_dimensions, + "The number of levels, that is the dimension, for " + "each degree of freedom " + "in canonical order that the operator acts on. A " + "value of zero or less " + "indicates that the operator is defined for any " + "dimension of that degree.") + .def(nanobind::init(), "Creates an identity operator on the given target.") - .def(py::init([](std::string operator_id, - std::vector degrees) { - return matrix_handler(std::move(operator_id), std::move(degrees)); - }), - py::arg("id"), py::arg("degrees"), - "Creates the matrix operator with the given id acting on the given " - "degrees of " - "freedom. Throws a runtime exception if no operator with that id " - "has been defined.") - .def(py::init(), "Copy constructor.") - .def("__eq__", &matrix_handler::operator==, py::is_operator()) - .def("to_string", &matrix_handler::to_string, py::arg("include_degrees"), + .def( + "__init__", + [](matrix_handler *self, std::string operator_id, + std::vector degrees) { + new (self) + matrix_handler(std::move(operator_id), std::move(degrees)); + }, + nanobind::arg("id"), nanobind::arg("degrees"), + "Creates the matrix operator with the given id acting on the given " + "degrees of " + "freedom. 
Throws a runtime exception if no operator with that id " + "has been defined.") + .def(nanobind::init(), "Copy constructor.") + .def("__eq__", &matrix_handler::operator==, nanobind::is_operator()) + .def("to_string", &matrix_handler::to_string, + nanobind::arg("include_degrees"), "Returns the string representation of the operator.") .def( "to_matrix", @@ -74,18 +80,19 @@ void bindOperatorHandlers(py::module &mod) { auto cmat = self.to_matrix(dimensions, params); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the matrix representation of the operator.") .def( "to_matrix", [](const matrix_handler &self, dimension_map &dimensions, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix(dimensions, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("kwargs"), "Returns the matrix representation of the operator.") // tools for custom operators @@ -93,12 +100,12 @@ void bindOperatorHandlers(py::module &mod) { "_define", [](std::string operator_id, std::vector expected_dimensions, const matrix_callback &func, bool overwrite, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { // we need to make sure the python function that is stored in // the static dictionary containing the operator definitions // is properly cleaned up - otherwise python will hang on exit... 
- auto atexit = py::module_::import("atexit"); - atexit.attr("register")(py::cpp_function([operator_id]() { + auto atexit = nanobind::module_::import_("atexit"); + atexit.attr("register")(nanobind::cpp_function([operator_id]() { matrix_handler::remove_definition(operator_id); })); if (overwrite) @@ -107,24 +114,25 @@ void bindOperatorHandlers(py::module &mod) { std::move(operator_id), std::move(expected_dimensions), func, details::kwargs_to_param_description(kwargs)); }, - py::arg("operator_id"), py::arg("expected_dimensions"), - py::arg("callback"), py::arg("overwrite") = false, + nanobind::arg("operator_id"), nanobind::arg("expected_dimensions"), + nanobind::arg("callback"), nanobind::arg("overwrite") = false, + nanobind::arg("kwargs"), "Defines a matrix operator with the given name and dimensions whose" "matrix representation can be obtained by invoking the given " "callback function."); - py::class_(mod, "BosonOperatorElement") - .def_property_readonly( - "target", &boson_handler::target, - "Returns the degree of freedom that the operator targets.") - .def_property_readonly("degrees", &boson_handler::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def(py::init(), + nanobind::class_(mod, "BosonOperatorElement") + .def_prop_ro("target", &boson_handler::target, + "Returns the degree of freedom that the operator targets.") + .def_prop_ro("degrees", &boson_handler::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def(nanobind::init(), "Creates an identity operator on the given target.") - .def(py::init(), "Copy constructor.") - .def("__eq__", &boson_handler::operator==, py::is_operator()) - .def("to_string", &boson_handler::to_string, py::arg("include_degrees"), + .def(nanobind::init(), "Copy constructor.") + .def("__eq__", &boson_handler::operator==, nanobind::is_operator()) + .def("to_string", &boson_handler::to_string, + nanobind::arg("include_degrees"), 
"Returns the string representation of the operator.") .def( "to_matrix", @@ -133,32 +141,33 @@ void bindOperatorHandlers(py::module &mod) { auto cmat = self.to_matrix(dimensions, params); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the matrix representation of the operator.") .def( "to_matrix", [](const boson_handler &self, dimension_map &dimensions, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix(dimensions, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); - py::class_(mod, "FermionOperatorElement") - .def_property_readonly( - "target", &fermion_handler::target, - "Returns the degree of freedom that the operator targets.") - .def_property_readonly("degrees", &fermion_handler::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def(py::init(), + nanobind::class_(mod, "FermionOperatorElement") + .def_prop_ro("target", &fermion_handler::target, + "Returns the degree of freedom that the operator targets.") + .def_prop_ro("degrees", &fermion_handler::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def(nanobind::init(), "Creates an identity operator on the given target.") - .def(py::init(), "Copy constructor.") - .def("__eq__", &fermion_handler::operator==, py::is_operator()) - .def("to_string", &fermion_handler::to_string, py::arg("include_degrees"), + .def(nanobind::init(), "Copy constructor.") + .def("__eq__", &fermion_handler::operator==, nanobind::is_operator()) + .def("to_string", &fermion_handler::to_string, + 
nanobind::arg("include_degrees"), "Returns the string representation of the operator.") .def( "to_matrix", @@ -167,34 +176,35 @@ void bindOperatorHandlers(py::module &mod) { auto cmat = self.to_matrix(dimensions, params); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the matrix representation of the operator.") .def( "to_matrix", [](const fermion_handler &self, dimension_map &dimensions, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix(dimensions, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); - py::class_(mod, "SpinOperatorElement") - .def_property_readonly( - "target", &spin_handler::target, - "Returns the degree of freedom that the operator targets.") - .def_property_readonly("degrees", &spin_handler::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def(py::init(), + nanobind::class_(mod, "SpinOperatorElement") + .def_prop_ro("target", &spin_handler::target, + "Returns the degree of freedom that the operator targets.") + .def_prop_ro("degrees", &spin_handler::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def(nanobind::init(), "Creates an identity operator on the given target.") - .def(py::init(), "Copy constructor.") - .def("__eq__", &spin_handler::operator==, py::is_operator()) + .def(nanobind::init(), "Copy constructor.") + .def("__eq__", &spin_handler::operator==, nanobind::is_operator()) .def("as_pauli", &spin_handler::as_pauli, "Returns the Pauli representation of the operator.") - .def("to_string", &spin_handler::to_string, 
py::arg("include_degrees"), + .def("to_string", &spin_handler::to_string, + nanobind::arg("include_degrees"), "Returns the string representation of the operator.") .def( "to_matrix", @@ -203,22 +213,23 @@ void bindOperatorHandlers(py::module &mod) { auto cmat = self.to_matrix(dimensions, params); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the matrix representation of the operator.") .def( "to_matrix", [](const spin_handler &self, dimension_map &dimensions, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix(dimensions, details::kwargs_to_param_map(kwargs)); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("kwargs"), "Returns the matrix representation of the operator."); } -void bindHandlersWrapper(py::module &mod) { +void bindHandlersWrapper(nanobind::module_ &mod) { bindPauli(mod); bindOperatorHandlers(mod); } diff --git a/python/runtime/cudaq/operators/py_handlers.h b/python/runtime/cudaq/operators/py_handlers.h index 3bcde5ad205..cd82dd92e44 100644 --- a/python/runtime/cudaq/operators/py_handlers.h +++ b/python/runtime/cudaq/operators/py_handlers.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of /// operator handlers to python. 
-void bindHandlersWrapper(py::module &mod); +void bindHandlersWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_helpers.cpp b/python/runtime/cudaq/operators/py_helpers.cpp index aecc5811074..b34212bce6e 100644 --- a/python/runtime/cudaq/operators/py_helpers.cpp +++ b/python/runtime/cudaq/operators/py_helpers.cpp @@ -8,18 +8,19 @@ #include "py_helpers.h" #include "cudaq/operators.h" +#include #include -#include -#include -#include +#include +#include +#include namespace cudaq::details { -cudaq::parameter_map kwargs_to_param_map(const py::kwargs &kwargs) { +cudaq::parameter_map kwargs_to_param_map(const nanobind::kwargs &kwargs) { cudaq::parameter_map params; - for (auto &[keyPy, valuePy] : kwargs) { - std::string key = py::str(keyPy); - std::complex value = valuePy.cast>(); + for (auto [keyPy, valuePy] : kwargs) { + std::string key = nanobind::str(keyPy).c_str(); + std::complex value = nanobind::cast>(valuePy); params.insert(params.end(), std::pair>(key, value)); } @@ -27,29 +28,34 @@ cudaq::parameter_map kwargs_to_param_map(const py::kwargs &kwargs) { } std::unordered_map -kwargs_to_param_description(const py::kwargs &kwargs) { +kwargs_to_param_description(const nanobind::kwargs &kwargs) { std::unordered_map param_desc; - for (auto &[keyPy, valuePy] : kwargs) { - std::string key = py::str(keyPy); - std::string value = py::str(valuePy); + for (auto [keyPy, valuePy] : kwargs) { + std::string key = nanobind::str(keyPy).c_str(); + std::string value = nanobind::str(valuePy).c_str(); param_desc.insert(param_desc.end(), std::pair(key, value)); } return param_desc; } -py::array_t> cmat_to_numpy(complex_matrix &cmat) { +nanobind::ndarray> +cmat_to_numpy(complex_matrix &cmat) { auto rows = cmat.rows(); auto cols = cmat.cols(); - auto data = cmat.get_data(complex_matrix::order::row_major); - std::vector shape = {static_cast(rows), - static_cast(cols)}; - std::vector strides = { - static_cast(sizeof(std::complex) * cols), - 
static_cast(sizeof(std::complex))}; - - // Return a numpy array without copying data - return py::array_t>(shape, strides, data); -}; + auto *src = cmat.get_data(complex_matrix::order::row_major); + std::size_t n = rows * cols; + std::size_t shape[2] = {rows, cols}; + + auto *copy = new std::complex[n]; + std::copy(src, src + n, copy); + + nanobind::capsule owner(copy, [](void *p) noexcept { + delete[] static_cast *>(p); + }); + + return nanobind::ndarray>(copy, 2, + shape, owner); +} } // namespace cudaq::details diff --git a/python/runtime/cudaq/operators/py_helpers.h b/python/runtime/cudaq/operators/py_helpers.h index 33b7463ae9a..e712281784f 100644 --- a/python/runtime/cudaq/operators/py_helpers.h +++ b/python/runtime/cudaq/operators/py_helpers.h @@ -7,14 +7,13 @@ ******************************************************************************/ #include "cudaq/operators.h" -#include -#include - -namespace py = pybind11; +#include +#include namespace cudaq::details { -cudaq::parameter_map kwargs_to_param_map(const py::kwargs &kwargs); +cudaq::parameter_map kwargs_to_param_map(const nanobind::kwargs &kwargs); std::unordered_map -kwargs_to_param_description(const py::kwargs &kwargs); -py::array_t> cmat_to_numpy(complex_matrix &cmat); +kwargs_to_param_description(const nanobind::kwargs &kwargs); +nanobind::ndarray> +cmat_to_numpy(complex_matrix &cmat); } // namespace cudaq::details diff --git a/python/runtime/cudaq/operators/py_matrix.cpp b/python/runtime/cudaq/operators/py_matrix.cpp index 6b2828c2973..48d37891e7f 100644 --- a/python/runtime/cudaq/operators/py_matrix.cpp +++ b/python/runtime/cudaq/operators/py_matrix.cpp @@ -6,10 +6,12 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators/matrix.h" #include "py_helpers.h" @@ -19,44 +21,25 @@ namespace cudaq { -/// @brief Extract the array data from a buffer_info into our -/// own allocated data pointer. -void extractMatrixData(py::buffer_info &info, std::complex *data) { - if (info.format != py::format_descriptor>::format()) - throw std::runtime_error( - "Incompatible buffer format, must be np.complex128."); - - if (info.ndim != 2) - throw std::runtime_error("Incompatible buffer shape."); - - memcpy(data, info.ptr, - sizeof(std::complex) * (info.shape[0] * info.shape[1])); -} - -void bindComplexMatrix(py::module &mod) { - py::class_( - mod, "ComplexMatrix", py::buffer_protocol(), +void bindComplexMatrix(nanobind::module_ &mod) { + nanobind::class_( + mod, "ComplexMatrix", "The :class:`ComplexMatrix` is a thin wrapper around a " "matrix of complex elements.") - /// The following makes this fully compatible with NumPy - .def_buffer([](complex_matrix &op) -> py::buffer_info { - return py::buffer_info( - op.get_data(complex_matrix::order::row_major), - sizeof(std::complex), - py::format_descriptor>::format(), 2, - {op.rows(), op.cols()}, - {sizeof(std::complex) * op.cols(), - sizeof(std::complex)}); - }) - .def(py::init([](const py::buffer &b) { - py::buffer_info info = b.request(); - complex_matrix m(info.shape[0], info.shape[1]); - extractMatrixData(info, - m.get_data(complex_matrix::order::row_major)); - return m; - }), - "Create a :class:`ComplexMatrix` from a buffer of data, such as a " - "numpy.ndarray.") + .def( + "__init__", + [](complex_matrix *self, + nanobind::ndarray, nanobind::ndim<2>, + nanobind::c_contig, nanobind::numpy> + arr) { + auto rows = arr.shape(0); + auto cols = arr.shape(1); + new (self) complex_matrix(rows, cols); + memcpy(self->get_data(complex_matrix::order::row_major), 
arr.data(), + sizeof(std::complex) * rows * cols); + }, + "Create a :class:`ComplexMatrix` from a buffer of data, such as a " + "numpy.ndarray.") .def( "num_rows", [](complex_matrix &m) { return m.rows(); }, "Returns the number of rows in the matrix.") @@ -85,7 +68,7 @@ void bindComplexMatrix(py::module &mod) { [](const complex_matrix &lhs, const complex_matrix &rhs) { return lhs == rhs; }, - py::is_operator()) + nanobind::is_operator()) .def("__str__", &complex_matrix::to_string, "Returns the string representation of the matrix.") .def( diff --git a/python/runtime/cudaq/operators/py_matrix.h b/python/runtime/cudaq/operators/py_matrix.h index 022a74fdbf8..baf93260e9e 100644 --- a/python/runtime/cudaq/operators/py_matrix.h +++ b/python/runtime/cudaq/operators/py_matrix.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of `cudaq::complex_matrix` /// to python. 
-void bindComplexMatrix(py::module &mod); +void bindComplexMatrix(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_matrix_op.cpp b/python/runtime/cudaq/operators/py_matrix_op.cpp index 187ab99f746..3883f86c9bd 100644 --- a/python/runtime/cudaq/operators/py_matrix_op.cpp +++ b/python/runtime/cudaq/operators/py_matrix_op.cpp @@ -7,10 +7,15 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -19,7 +24,7 @@ namespace cudaq { -void bindOperatorsModule(py::module &mod) { +void bindOperatorsModule(nanobind::module_ &mod) { // Binding the functions in `cudaq::operators` as `_pycudaq` submodule // so it's accessible directly in the cudaq namespace. auto operators_submodule = mod.def_submodule("operators"); @@ -33,34 +38,34 @@ void bindOperatorsModule(py::module &mod) { operators_submodule.def( "identity", [](std::size_t target) { return matrix_op::identity(target); }, - py::arg("target"), + nanobind::arg("target"), "Returns an identity operator on the given target index."); operators_submodule.def( "identities", [](std::size_t first, std::size_t last) { return matrix_op_term(first, last); }, - py::arg("first"), py::arg("last"), + nanobind::arg("first"), nanobind::arg("last"), "Creates a product operator that applies an identity operation to all " "degrees of " "freedom in the open range [first, last)."); operators_submodule.def( - "number", &matrix_op::number, py::arg("target"), + "number", &matrix_op::number, nanobind::arg("target"), "Returns a number operator on the given target index."); operators_submodule.def( - "parity", &matrix_op::parity, py::arg("target"), + "parity", &matrix_op::parity, nanobind::arg("target"), "Returns a parity operator on the given target index."); 
operators_submodule.def( - "position", &matrix_op::position, py::arg("target"), + "position", &matrix_op::position, nanobind::arg("target"), "Returns a position operator on the given target index."); operators_submodule.def( - "momentum", &matrix_op::momentum, py::arg("target"), + "momentum", &matrix_op::momentum, nanobind::arg("target"), "Returns a momentum operator on the given target index."); operators_submodule.def( - "squeeze", &matrix_op::squeeze, py::arg("target"), + "squeeze", &matrix_op::squeeze, nanobind::arg("target"), "Returns a squeezing operator on the given target index."); operators_submodule.def( - "displace", &matrix_op::displace, py::arg("target"), + "displace", &matrix_op::displace, nanobind::arg("target"), "Returns a displacement operator on the given target index."); operators_submodule.def( "canonicalized", @@ -94,41 +99,43 @@ void bindOperatorsModule(py::module &mod) { "degrees of freedom."); } -void bindMatrixOperator(py::module &mod) { +void bindMatrixOperator(nanobind::module_ &mod) { - auto matrix_op_class = py::class_(mod, "MatrixOperator"); + auto matrix_op_class = nanobind::class_(mod, "MatrixOperator"); auto matrix_op_term_class = - py::class_(mod, "MatrixOperatorTerm"); + nanobind::class_(mod, "MatrixOperatorTerm"); matrix_op_class .def( "__iter__", [](matrix_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &matrix_op::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &matrix_op::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets.") - .def_property_readonly("min_degree", 
&matrix_op::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &matrix_op::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("term_count", &matrix_op::num_terms, - "Returns the number of terms in the operator.") + .def_prop_ro("parameters", &matrix_op::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &matrix_op::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets.") + .def_prop_ro("min_degree", &matrix_op::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &matrix_op::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("term_count", &matrix_op::num_terms, + "Returns the number of terms in the operator.") // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a default instantiated sum. A default instantiated " "sum has no value; it will take a value the first time an " "arithmetic operation " @@ -137,15 +144,15 @@ void bindMatrixOperator(py::module &mod) { "identity. 
To construct a `0` value in the mathematical sense " "(neutral element " "for addition), use `empty()` instead.") - .def(py::init(), + .def(nanobind::init(), "Creates a sum operator with no terms, reserving " "space for the given number of terms.") - .def(py::init()) - .def(py::init()) - .def(py::init()) - .def(py::init(), + .def(nanobind::init()) + .def(nanobind::init()) + .def(nanobind::init()) + .def(nanobind::init(), "Creates a sum operator with the given term.") - .def(py::init(), "Copy constructor.") + .def(nanobind::init(), "Copy constructor.") .def( "copy", [](const matrix_op &self) { return matrix_op(self); }, "Creates a copy of the operator.") @@ -159,9 +166,9 @@ void bindMatrixOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -172,13 +179,13 @@ void bindMatrixOperator(py::module &mod) { .def( "to_matrix", [](const matrix_op &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." 
"The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -188,7 +195,7 @@ void bindMatrixOperator(py::module &mod) { // comparisons - .def("__eq__", &matrix_op::operator==, py::is_operator(), + .def("__eq__", &matrix_op::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "into account that addition is commutative and so is multiplication " @@ -202,85 +209,85 @@ void bindMatrixOperator(py::module &mod) { [](const matrix_op &self, const matrix_op_term &other) { return self.num_terms() == 1 && *self.begin() == other; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self += int(), py::is_operator()) - .def(py::self -= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self += double(), py::is_operator()) - .def(py::self -= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self += std::complex(), py::is_operator()) - .def(py::self -= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self += scalar_operator(), py::is_operator()) - .def(py::self -= scalar_operator(), py::is_operator()) - .def(py::self *= matrix_op_term(), py::is_operator()) - .def(py::self += matrix_op_term(), py::is_operator()) - .def(py::self -= 
matrix_op_term(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) - .def(py::self += py::self, py::is_operator()) -// see issue https://github.com/pybind/pybind11/issues/1893 + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self += int(), nanobind::is_operator()) + .def(nanobind::self -= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self += double(), nanobind::is_operator()) + .def(nanobind::self -= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self += std::complex(), nanobind::is_operator()) + .def(nanobind::self -= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self += scalar_operator(), nanobind::is_operator()) + .def(nanobind::self -= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self += matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self -= matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) + .def(nanobind::self += nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" #endif - .def(py::self -= py::self, py::is_operator()) + .def(nanobind::self -= nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic pop #endif // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - 
int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), 
nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // common operators @@ -308,17 +315,17 @@ void bindMatrixOperator(py::module &mod) { .def("dump", &matrix_op::dump, "Prints the string representation of the operator to the standard " "output.") - .def("trim", &matrix_op::trim, py::arg("tol") = 0.0, - py::arg("parameters") = 
parameter_map(), + .def("trim", &matrix_op::trim, nanobind::arg("tol") = 0.0, + nanobind::arg("parameters") = parameter_map(), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") .def( "trim", - [](matrix_op &self, double tol, const py::kwargs &kwargs) { + [](matrix_op &self, double tol, const nanobind::kwargs &kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - py::arg("tol") = 0.0, + nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -343,42 +350,44 @@ void bindMatrixOperator(py::module &mod) { .def( "__iter__", [](matrix_op_term &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &matrix_op_term::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &matrix_op_term::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. 
An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &matrix_op_term::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &matrix_op_term::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("ops_count", &matrix_op_term::num_ops, - "Returns the number of operators in the product.") - .def_property_readonly( + .def_prop_ro("parameters", &matrix_op_term::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &matrix_op_term::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &matrix_op_term::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &matrix_op_term::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("ops_count", &matrix_op_term::num_ops, + "Returns the number of operators in the product.") + .def_prop_ro( "term_id", &matrix_op_term::get_term_id, "The term id uniquely identifies the operators and targets (degrees) " "that they act on, " "but does not include information about the coefficient.") - .def_property_readonly( + .def_prop_ro( "coefficient", &matrix_op_term::get_coefficient, "Returns the unevaluated coefficient of the operator. The " "coefficient is a " @@ -386,33 +395,35 @@ void bindMatrixOperator(py::module &mod) { // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a product operator with constant value 1. The returned " "operator does not target any degrees of freedom but merely " "represents a constant.") - .def(py::init(), py::arg("first_degree"), - py::arg("last_degree"), + .def(nanobind::init(), + nanobind::arg("first_degree"), nanobind::arg("last_degree"), "Creates a product operator that applies an identity operation to " "all degrees of " "freedom in the range [first_degree, last_degree).") - .def(py::init(), + .def(nanobind::init(), "Creates a product operator with the given constant value. " "The returned operator does not target any degrees of freedom.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a product operator with the given " "constant value. 
The returned operator does not target any degrees " "of freedom.") - .def(py::init([](const scalar_operator &scalar) { - return matrix_op_term() * scalar; - }), - "Creates a product operator with non-constant scalar value.") - .def(py::init(), + .def( + "__init__", + [](matrix_op_term *self, const scalar_operator &scalar) { + new (self) matrix_op_term(matrix_op_term() * scalar); + }, + "Creates a product operator with non-constant scalar value.") + .def(nanobind::init(), "Creates a product operator with the given elementary operator.") - .def(py::init()) - .def(py::init()) - .def(py::init()) - .def(py::init(), py::arg("operator"), - py::arg("size") = 0, + .def(nanobind::init()) + .def(nanobind::init()) + .def(nanobind::init()) + .def(nanobind::init(), + nanobind::arg("operator"), nanobind::arg("size") = 0, "Creates a copy of the given operator and reserves space for " "storing the given " "number of product terms (if a size is provided).") @@ -424,7 +435,7 @@ void bindMatrixOperator(py::module &mod) { // evaluations .def("evaluate_coefficient", &matrix_op_term::evaluate_coefficient, - py::arg("parameters") = parameter_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the evaluated coefficient of the product operator. The " "parameters is a map of parameter names to their concrete, complex " "values.") @@ -435,9 +446,9 @@ void bindMatrixOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -447,13 +458,13 @@ void bindMatrixOperator(py::module &mod) { .def( "to_matrix", [](const matrix_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -463,7 +474,7 @@ void bindMatrixOperator(py::module &mod) { // comparisons - .def("__eq__", &matrix_op_term::operator==, py::is_operator(), + .def("__eq__", &matrix_op_term::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. 
The equivalence " "check takes " "into account that multiplication of operators that act on " @@ -476,71 +487,72 @@ void bindMatrixOperator(py::module &mod) { [](const matrix_op_term &self, const matrix_op &other) { return other.num_terms() == 1 && *other.begin() == self; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), 
py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + 
.def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // general utility functions .def("is_identity", &matrix_op_term::is_identity, "Checks if all operators in the product are the identity. 
" - "Note: this function returns true regardless of the value of the " - "coefficient.") + "Note that this function returns true regardless of the value of " + "the coefficient.") .def( "__str__", [](const matrix_op_term &self) { return self.to_string(); }, @@ -564,18 +576,18 @@ void bindMatrixOperator(py::module &mod) { "of freedom that are not included in the given set."); } -void bindOperatorsWrapper(py::module &mod) { +void bindOperatorsWrapper(nanobind::module_ &mod) { bindMatrixOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, matrix_op_term>(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); - py::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, matrix_op_term>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); bindOperatorsModule(mod); } diff --git a/python/runtime/cudaq/operators/py_matrix_op.h b/python/runtime/cudaq/operators/py_matrix_op.h index 97b154b720f..28df05d8efb 100644 --- a/python/runtime/cudaq/operators/py_matrix_op.h +++ b/python/runtime/cudaq/operators/py_matrix_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of matrix /// operators to python. 
-void bindOperatorsWrapper(py::module &mod); +void bindOperatorsWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_scalar_op.cpp b/python/runtime/cudaq/operators/py_scalar_op.cpp index fc92a836551..1ed437dc316 100644 --- a/python/runtime/cudaq/operators/py_scalar_op.cpp +++ b/python/runtime/cudaq/operators/py_scalar_op.cpp @@ -10,11 +10,15 @@ #include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -23,47 +27,127 @@ namespace cudaq { -void bindScalarOperator(py::module &mod) { - using scalar_callback = - std::function(const parameter_map &)>; +namespace { + +std::pair, bool> +introspectCallable(const nanobind::callable &func) { + nanobind::module_ inspect = nanobind::module_::import_("inspect"); + nanobind::object argSpec = inspect.attr("getfullargspec")(func); + + if (!argSpec.attr("varargs").is_none()) + throw std::invalid_argument( + "the function defining a scalar operator must not take *args"); + + nanobind::module_ helpers = + nanobind::module_::import_("cudaq.operators.helpers"); + nanobind::object paramDocsFn = helpers.attr("_parameter_docs"); + nanobind::object docstring = func.attr("__doc__"); + + std::unordered_map paramDesc; + for (nanobind::handle name : argSpec.attr("args")) { + std::string n = nanobind::cast(name); + std::string doc = nanobind::cast( + paramDocsFn(nanobind::str(n.c_str()), docstring)); + paramDesc[n] = doc; + } + for (nanobind::handle name : argSpec.attr("kwonlyargs")) { + std::string n = nanobind::cast(name); + std::string doc = nanobind::cast( + paramDocsFn(nanobind::str(n.c_str()), docstring)); + paramDesc[n] = doc; + } + + bool acceptsKwargs = !argSpec.attr("varkw").is_none(); + return {std::move(paramDesc), acceptsKwargs}; +} + +scalar_callback wrapPythonCallable(nanobind::callable func, + const std::vector 
¶mNames, + bool acceptsKwargs) { + return [func = std::move(func), paramNames, + acceptsKwargs](const parameter_map ¶ms) -> std::complex { + nanobind::gil_scoped_acquire guard; + nanobind::dict pyKwargs; + if (acceptsKwargs) { + for (const auto &[k, v] : params) + pyKwargs[k.c_str()] = nanobind::cast(v); + } else { + for (const auto &name : paramNames) { + auto it = params.find(name); + if (it != params.end()) + pyKwargs[name.c_str()] = nanobind::cast(it->second); + } + } + nanobind::object result = func(**pyKwargs); + return nanobind::cast>(result); + }; +} + +} // anonymous namespace + +void bindScalarOperator(nanobind::module_ &mod) { - py::class_(mod, "ScalarOperator") + nanobind::class_(mod, "ScalarOperator") // properties - .def_property_readonly("parameters", - &scalar_operator::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") + .def_prop_ro("parameters", &scalar_operator::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") // constructors - .def(py::init<>(), "Creates a scalar operator with constant value 1.") - .def(py::init(), + .def(nanobind::init<>(), + "Creates a scalar operator with constant value 1.") + .def(nanobind::init(), "Creates a scalar operator with the given constant value.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a scalar operator with the given constant value.") - .def(py::init([](const scalar_callback &func, const py::kwargs &kwargs) { - return scalar_operator( - func, details::kwargs_to_param_description(kwargs)); - }), - py::arg("callback"), - "Creates a scalar operator where the given callback function is " - "invoked during evaluation.") - .def(py::init(), "Copy constructor.") + .def( + "__init__", + [](scalar_operator *self, nanobind::callable func) { + auto [paramDesc, acceptsKwargs] = introspectCallable(func); + std::vector paramNames; + for (const auto &[k, v] : paramDesc) + paramNames.push_back(k); 
+ auto callback = + wrapPythonCallable(std::move(func), paramNames, acceptsKwargs); + new (self) + scalar_operator(std::move(callback), std::move(paramDesc)); + }, + nanobind::arg("generator"), + "Creates a scalar operator from a callable. Parameter names are " + "introspected from the function signature.") + .def( + "__init__", + [](scalar_operator *self, nanobind::callable func, + const nanobind::kwargs &kwargs) { + auto [introspected, acceptsKwargs] = introspectCallable(func); + auto paramDesc = details::kwargs_to_param_description(kwargs); + std::vector paramNames; + for (const auto &[k, v] : paramDesc) + paramNames.push_back(k); + auto callback = + wrapPythonCallable(std::move(func), paramNames, acceptsKwargs); + new (self) + scalar_operator(std::move(callback), std::move(paramDesc)); + }, + "Creates a scalar operator from a callable with keyword argument " + "parameter descriptions.") + .def(nanobind::init(), "Copy constructor.") // evaluations .def( "evaluate", - [](const scalar_operator &self, const py::kwargs &kwargs) { + [](const scalar_operator &self, const nanobind::kwargs &kwargs) { return self.evaluate(details::kwargs_to_param_map(kwargs)); }, "Evaluated value of the operator.") // comparisons - .def("__eq__", &scalar_operator::operator==, py::is_operator()) + .def("__eq__", &scalar_operator::operator==, nanobind::is_operator()) // general utility functions @@ -73,10 +157,10 @@ void bindScalarOperator(py::module &mod) { "Returns the string representation of the operator."); } -void bindScalarWrapper(py::module &mod) { +void bindScalarWrapper(nanobind::module_ &mod) { bindScalarOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, scalar_operator>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, scalar_operator>(); } } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_scalar_op.h b/python/runtime/cudaq/operators/py_scalar_op.h index 046f23411c6..4197132a60c 100644 --- 
a/python/runtime/cudaq/operators/py_scalar_op.h +++ b/python/runtime/cudaq/operators/py_scalar_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of `cudaq::spin` /// and `cudaq::spin_op` to python. -void bindScalarWrapper(py::module &mod); +void bindScalarWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_spin_op.cpp b/python/runtime/cudaq/operators/py_spin_op.cpp index 0e35fb1e0d9..e901dcac0cd 100644 --- a/python/runtime/cudaq/operators/py_spin_op.cpp +++ b/python/runtime/cudaq/operators/py_spin_op.cpp @@ -7,10 +7,18 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "cudaq/operators/serialization.h" @@ -20,8 +28,8 @@ namespace cudaq { /// @brief Map an OpenFermion operator to our own spin operator -spin_op fromOpenFermionQubitOperator(py::object &op) { - if (!py::hasattr(op, "terms")) +spin_op fromOpenFermionQubitOperator(nanobind::object &op) { + if (!nanobind::hasattr(op, "terms")) throw std::runtime_error( "This is not an openfermion operator, must have 'terms' attribute."); std::map> creatorMap{ @@ -31,18 +39,19 @@ spin_op fromOpenFermionQubitOperator(py::object &op) { auto terms = op.attr("terms"); auto H = spin_op::empty(); for (auto term : terms) { - auto termTuple = term.cast(); + auto termTuple = nanobind::cast(term); auto localTerm = spin_op::identity(); - for (auto &element : termTuple) { - auto casted = element.cast>(); + for (auto element : termTuple) { + auto casted = + nanobind::cast>(element); localTerm *= 
creatorMap[casted.second](casted.first); } - H += terms[term].cast() * localTerm; + H += nanobind::cast(terms[term]) * localTerm; } return H; } -void bindSpinModule(py::module &mod) { +void bindSpinModule(nanobind::module_ &mod) { // Binding the functions in `cudaq::spin` as `_pycudaq` submodule // so it's accessible directly in the cudaq namespace. auto spin_submodule = mod.def_submodule("spin"); @@ -56,33 +65,35 @@ void bindSpinModule(py::module &mod) { // here for consistency with other operators spin_submodule.def( "identity", [](std::size_t target) { return spin_op::identity(target); }, - py::arg("target"), + nanobind::arg("target"), "Returns an identity operator on the given target index."); spin_submodule.def( "identities", [](std::size_t first, std::size_t last) { return spin_op_term(first, last); }, - py::arg("first"), py::arg("last"), + nanobind::arg("first"), nanobind::arg("last"), "Creates a product operator that applies an identity operation to all " "degrees of " "freedom in the open range [first, last)."); - spin_submodule.def("i", &spin_op::i, py::arg("target"), + spin_submodule.def("i", &spin_op::i, nanobind::arg("target"), "Returns a Pauli I spin operator on the given " "target qubit index."); spin_submodule.def( - "x", &spin_op::x, py::arg("target"), + "x", &spin_op::x, nanobind::arg("target"), "Returns a Pauli X spin operator on the given target qubit index."); spin_submodule.def( - "y", &spin_op::y, py::arg("target"), + "y", &spin_op::y, nanobind::arg("target"), "Returns a Pauli Y spin operator on the given target qubit index."); spin_submodule.def( - "z", &spin_op::z, py::arg("target"), + "z", &spin_op::z, nanobind::arg("target"), "Returns a Pauli Z spin operator on the given target qubit index."); - spin_submodule.def("plus", &spin_op::plus, py::arg("target"), + spin_submodule.def("plus", &spin_op::plus, + nanobind::arg("target"), "Return a sigma plus spin operator on the given " "target qubit index."); - spin_submodule.def("minus", 
&spin_op::minus, py::arg("target"), + spin_submodule.def("minus", &spin_op::minus, + nanobind::arg("target"), "Return a sigma minus spin operator on the given " "target qubit index."); spin_submodule.def( @@ -115,52 +126,55 @@ void bindSpinModule(py::module &mod) { "degrees of freedom."); } -void bindSpinOperator(py::module &mod) { +void bindSpinOperator(nanobind::module_ &mod) { - auto spin_op_class = py::class_(mod, "SpinOperator"); - auto spin_op_term_class = py::class_(mod, "SpinOperatorTerm"); + auto spin_op_class = nanobind::class_(mod, "SpinOperator"); + auto spin_op_term_class = + nanobind::class_(mod, "SpinOperatorTerm"); spin_op_class .def( "__iter__", [](spin_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", &spin_op::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &spin_op::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. 
An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &spin_op::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &spin_op::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("term_count", &spin_op::num_terms, - "Returns the number of terms in the operator.") + .def_prop_ro("parameters", &spin_op::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &spin_op::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. 
An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &spin_op::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &spin_op::max_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("term_count", &spin_op::num_terms, + "Returns the number of terms in the operator.") // only exists for spin operators - .def_property_readonly( - "qubit_count", &spin_op::num_qubits, - "Return the number of qubits this operator acts on.") + .def_prop_ro("qubit_count", &spin_op::num_qubits, + "Return the number of qubits this operator acts on.") // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a default instantiated sum. A default instantiated " "sum has no value; it will take a value the first time an " "arithmetic operation " @@ -169,27 +183,31 @@ void bindSpinOperator(py::module &mod) { "identity. 
To construct a `0` value in the mathematical sense " "(neutral element " "for addition), use `empty()` instead.") - .def(py::init(), py::arg("size"), + .def(nanobind::init(), nanobind::arg("size"), "Creates a sum operator with no terms, reserving " "space for the given number of terms (size).") // NOTE: only supported on spin ops so far - .def(py::init &>(), py::arg("data"), + .def(nanobind::init &>(), nanobind::arg("data"), "Creates an operator based on a serialized data representation.") // NOTE: only supported on spin ops so far - .def(py::init([](const std::string &fileName) { - binary_spin_op_reader reader; - return reader.read(fileName); - }), - "Creates an operator based on a serialized data representation in " - "the given file.") - .def(py::init(), + .def( + "__init__", + [](spin_op *self, const std::string &fileName) { + binary_spin_op_reader reader; + new (self) spin_op(reader.read(fileName)); + }, + "Creates an operator based on a serialized data representation in " + "the given file.") + .def(nanobind::init(), "Creates a sum operator with the given term.") - .def(py::init(), "Copy constructor.") + .def(nanobind::init(), "Copy constructor.") // NOTE: only supported on spin ops - .def(py::init([](py::object obj) { - return fromOpenFermionQubitOperator(obj); - }), - "Convert an OpenFermion operator to a CUDA-Q spin operator.") + .def( + "__init__", + [](spin_op *self, nanobind::object obj) { + new (self) spin_op(fromOpenFermionQubitOperator(obj)); + }, + "Convert an OpenFermion operator to a CUDA-Q spin operator.") .def( "copy", [](const spin_op &self) { return spin_op(self); }, "Creates a copy of the operator.") @@ -200,15 +218,16 @@ void bindSpinOperator(py::module &mod) { .def_static( "from_json", [](const std::string &json_str) { - py::object json = py::module_::import("json"); - auto data = py::list(json.attr("loads")(json_str)); - return spin_op(data.cast>()); + nanobind::object json = nanobind::module_::import_("json"); + auto data = 
nanobind::list(json.attr("loads")(json_str)); + return spin_op(nanobind::cast>(data)); }, "Convert JSON string ('[d1, d2, d3, ...]') to spin_op") // NOTE: only supported on spin ops .def_static( - "random", &spin_op::random, py::arg("qubit_count"), - py::arg("term_count"), py::arg("seed") = std::random_device{}(), + "random", &spin_op::random, + nanobind::arg("qubit_count"), nanobind::arg("term_count"), + nanobind::arg("seed") = std::random_device{}(), "Return a random spin operator with the given number of terms " "(`term_count`) where each term acts on all targets in the open " "range " @@ -223,9 +242,9 @@ void bindSpinOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -235,13 +254,13 @@ void bindSpinOperator(py::module &mod) { .def( "to_matrix", [](const spin_op &self, dimension_map &dimensions, bool invert_order, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. 
This order " @@ -254,9 +273,9 @@ void bindSpinOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -270,12 +289,12 @@ void bindSpinOperator(py::module &mod) { .def( "to_sparse_matrix", [](const spin_op &self, dimension_map &dimensions, bool invert_order, - const py::kwargs &kwargs) { + const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -289,7 +308,7 @@ void bindSpinOperator(py::module &mod) { // comparisons - .def("__eq__", &spin_op::operator==, py::is_operator(), + .def("__eq__", &spin_op::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. 
Operators acting on different " @@ -301,91 +320,91 @@ void bindSpinOperator(py::module &mod) { [](const spin_op &self, const spin_op_term &other) { return self.num_terms() == 1 && *self.begin() == other; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self += int(), py::is_operator()) - .def(py::self -= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self += double(), py::is_operator()) - .def(py::self -= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self += std::complex(), py::is_operator()) - .def(py::self -= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self += scalar_operator(), py::is_operator()) - .def(py::self -= scalar_operator(), py::is_operator()) - .def(py::self *= spin_op_term(), py::is_operator()) - .def(py::self += spin_op_term(), py::is_operator()) - .def(py::self -= spin_op_term(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) - .def(py::self += py::self, py::is_operator()) -// see issue https://github.com/pybind/pybind11/issues/1893 + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self += int(), nanobind::is_operator()) + .def(nanobind::self -= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) 
+ .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self += double(), nanobind::is_operator()) + .def(nanobind::self -= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self += std::complex(), nanobind::is_operator()) + .def(nanobind::self -= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self += scalar_operator(), nanobind::is_operator()) + .def(nanobind::self -= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= spin_op_term(), nanobind::is_operator()) + .def(nanobind::self += spin_op_term(), nanobind::is_operator()) + .def(nanobind::self -= spin_op_term(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) + .def(nanobind::self += nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wself-assign-overloaded" #endif - .def(py::self -= py::self, py::is_operator()) + .def(nanobind::self -= nanobind::self, nanobind::is_operator()) #ifdef __clang__ #pragma clang diagnostic pop #endif // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - .def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - 
.def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * spin_op_term(), py::is_operator()) - .def(py::self + spin_op_term(), py::is_operator()) - .def(py::self - spin_op_term(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * spin_op_term(), nanobind::is_operator()) + .def(nanobind::self + spin_op_term(), nanobind::is_operator()) + .def(nanobind::self - spin_op_term(), nanobind::is_operator()) + .def(nanobind::self * 
nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // common operators @@ -420,22 +439,22 @@ void 
bindSpinOperator(py::module &mod) { .def( "to_json", [](const spin_op &self) { - py::object json = py::module_::import("json"); + nanobind::object json = nanobind::module_::import_("json"); auto data = self.get_data_representation(); return json.attr("dumps")(data); }, - "Convert spin_op to JSON string: '[d1, d2, d3, ...]'") - .def("trim", &spin_op::trim, py::arg("tol") = 0.0, - py::arg("parameters") = parameter_map(), + "Convert spin_op to a JSON string, e.g., '[d1, d2, d3, ...]'.") + .def("trim", &spin_op::trim, nanobind::arg("tol") = 0.0, + nanobind::arg("parameters") = parameter_map(), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") .def( "trim", - [](spin_op &self, double tol, const py::kwargs &kwargs) { + [](spin_op &self, double tol, const nanobind::kwargs &kwargs) { return self.trim(tol, details::kwargs_to_param_map(kwargs)); }, - py::arg("tol") = 0.0, + nanobind::arg("tol") = 0.0, nanobind::arg("kwargs"), "Removes all terms from the sum for which the absolute value of the " "coefficient is below " "the given tolerance.") @@ -522,33 +541,37 @@ void bindSpinOperator(py::module &mod) { "(product operator) in future releases.") // constructor for old serialization format .def( - py::init([](const std::vector &data, std::size_t num_qubits) { + "__init__", + [](spin_op *self, const std::vector &data, + std::size_t num_qubits) { PyErr_WarnEx( PyExc_DeprecationWarning, "serialization format changed - use the constructor without a " "size_t argument to create a spin_op from the new format", 1); - return spin_op(data, num_qubits); - }), - py::arg("data"), py::arg("num_qubits"), + new (self) spin_op(data, num_qubits); + }, + nanobind::arg("data"), nanobind::arg("num_qubits"), "Deprecated - use constructor without the `num_qubits` argument " "instead.") // new constructor with deprecation warning provided only for backwards // compatibility (matching the deprecated data constructor for the old // 
serialization format above) - .def(py::init([](const std::string &fileName, bool legacy) { - binary_spin_op_reader reader; - PyErr_WarnEx( - PyExc_DeprecationWarning, - "overload provided for compatibility with the deprecated " - "serialization format - please migrate to the new format and " - "use the constructor without boolean argument", - 1); - return reader.read(fileName, legacy); - }), - py::arg("filename"), py::arg("legacy"), - "Constructor available for loading deprecated data representations " - "from file - will be removed in future releases.") + .def( + "__init__", + [](spin_op *self, const std::string &fileName, bool legacy) { + binary_spin_op_reader reader; + PyErr_WarnEx( + PyExc_DeprecationWarning, + "overload provided for compatibility with the deprecated " + "serialization format - please migrate to the new format and " + "use the constructor without boolean argument", + 1); + new (self) spin_op(reader.read(fileName, legacy)); + }, + nanobind::arg("filename"), nanobind::arg("legacy"), + "Constructor available for loading deprecated data representations " + "from file - will be removed in future releases.") .def_static( "empty_op", []() { @@ -568,27 +591,28 @@ void bindSpinOperator(py::module &mod) { 1); return self.to_string(print_coefficient); }, - py::arg("print_coefficient") = true, + nanobind::arg("print_coefficient") = true, "Deprecated - use the standard `str` conversion or `get_pauli_word` " "on each term instead.") .def( "for_each_term", - [](spin_op &self, py::function functor) { + [](spin_op &self, nanobind::callable functor) { PyErr_WarnEx(PyExc_DeprecationWarning, "use standard iteration instead", 1); self.for_each_term(functor); }, - py::arg("function"), "Deprecated - use standard iteration instead.") + nanobind::arg("function"), + "Deprecated - use standard iteration instead.") .def( "for_each_pauli", - [](spin_op &self, py::function functor) { + [](spin_op &self, nanobind::callable functor) { PyErr_WarnEx(PyExc_DeprecationWarning, 
"iterate over the sum to get each term and then " "iterate over the term(s) instead", 1); self.for_each_pauli(functor); }, - py::arg("function"), + nanobind::arg("function"), "Deprecated - iterator over sum and then iterator over term " "instead."); #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)) @@ -602,49 +626,50 @@ void bindSpinOperator(py::module &mod) { .def( "__iter__", [](spin_op_term &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), "Loop through each term of the operator.") + nanobind::keep_alive<0, 1>(), + "Loop through each term of the operator.") // properties - .def_property_readonly("parameters", - &spin_op_term::get_parameter_descriptions, - "Returns a dictionary that maps each parameter " - "name to its description.") - .def_property_readonly("degrees", &spin_op_term::degrees, - "Returns a vector that lists all degrees of " - "freedom that the operator targets. " - "The order of degrees is from smallest to largest " - "and reflects the ordering of " - "the matrix returned by `to_matrix`. " - "Specifically, the indices of a statevector " - "with two qubits are {00, 01, 10, 11}. 
An " - "ordering of degrees {0, 1} then indicates " - "that a state where the qubit with index 0 equals " - "1 with probability 1 is given by " - "the vector {0., 1., 0., 0.}.") - .def_property_readonly("min_degree", &spin_op_term::min_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("max_degree", &spin_op_term::max_degree, - "Returns the smallest index of the degrees of " - "freedom that the operator targets.") - .def_property_readonly("ops_count", &spin_op_term::num_ops, - "Returns the number of operators in the product.") - .def_property_readonly( + .def_prop_ro("parameters", &spin_op_term::get_parameter_descriptions, + "Returns a dictionary that maps each parameter " + "name to its description.") + .def_prop_ro("degrees", &spin_op_term::degrees, + "Returns a vector that lists all degrees of " + "freedom that the operator targets. " + "The order of degrees is from smallest to largest " + "and reflects the ordering of " + "the matrix returned by `to_matrix`. " + "Specifically, the indices of a statevector " + "with two qubits are {00, 01, 10, 11}. An " + "ordering of degrees {0, 1} then indicates " + "that a state where the qubit with index 0 equals " + "1 with probability 1 is given by " + "the vector {0., 1., 0., 0.}.") + .def_prop_ro("min_degree", &spin_op_term::min_degree, + "Returns the smallest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("max_degree", &spin_op_term::max_degree, + "Returns the largest index of the degrees of " + "freedom that the operator targets.") + .def_prop_ro("ops_count", &spin_op_term::num_ops, + "Returns the number of operators in the product.") + .def_prop_ro( + "term_count", [](const spin_op_term &) { return 1; }, + "Returns the number of terms in the operator. 
Always returns 1.") // only exists for spin operators - .def_property_readonly( - "qubit_count", &spin_op_term::num_qubits, - "Return the number of qubits this operator acts on.") - .def_property_readonly( + .def_prop_ro("qubit_count", &spin_op_term::num_qubits, + "Return the number of qubits this operator acts on.") + .def_prop_ro( "term_id", &spin_op_term::get_term_id, "The term id uniquely identifies the operators and targets (degrees) " "that they act on, " "but does not include information about the coefficient.") - .def_property_readonly( + .def_prop_ro( "coefficient", &spin_op_term::get_coefficient, "Returns the unevaluated coefficient of the operator. The " "coefficient is a " @@ -652,60 +677,66 @@ void bindSpinOperator(py::module &mod) { // constructors - .def(py::init<>(), + .def(nanobind::init<>(), "Creates a product operator with constant value 1. The returned " "operator does not target any degrees of freedom but merely " "represents a constant.") - .def(py::init(), py::arg("first_degree"), - py::arg("last_degree"), + .def(nanobind::init(), + nanobind::arg("first_degree"), nanobind::arg("last_degree"), "Creates a product operator that applies an identity operation to " "all degrees of " "freedom in the range [first_degree, last_degree).") // NOTE: only supported on spin ops so far - .def(py::init([](const std::vector &data) { - spin_op op(data); - if (op.num_terms() != 1) - throw std::runtime_error( - "invalid data representation for product operator"); - return *op.begin(); - }), - py::arg("data"), - "Creates an operator based on a serialized data representation.") + .def( + "__init__", + [](spin_op_term *self, const std::vector &data) { + spin_op op(data); + if (op.num_terms() != 1) + throw std::runtime_error( + "invalid data representation for product operator"); + new (self) spin_op_term(*op.begin()); + }, + nanobind::arg("data"), + "Creates an operator based on a serialized data representation.") // NOTE: only supported on spin ops so far - 
.def(py::init([](const std::string &fileName) { - binary_spin_op_reader reader; - spin_op op = reader.read(fileName); - if (op.num_terms() != 1) - throw std::runtime_error( - "invalid data representation for product operator"); - return *op.begin(); - }), - "Creates an operator based on a serialized data representation in " - "the given file.") - .def(py::init(), + .def( + "__init__", + [](spin_op_term *self, const std::string &fileName) { + binary_spin_op_reader reader; + spin_op op = reader.read(fileName); + if (op.num_terms() != 1) + throw std::runtime_error( + "invalid data representation for product operator"); + new (self) spin_op_term(*op.begin()); + }, + "Creates an operator based on a serialized data representation in " + "the given file.") + .def(nanobind::init(), "Creates a product operator with the given constant value. " "The returned operator does not target any degrees of freedom.") - .def(py::init>(), + .def(nanobind::init>(), "Creates a product operator with the given " "constant value. 
The returned operator does not target any degrees " "of freedom.") - .def(py::init([](const scalar_operator &scalar) { - return spin_op_term() * scalar; - }), - "Creates a product operator with non-constant scalar value.") - .def(py::init(), + .def( + "__init__", + [](spin_op_term *self, const scalar_operator &scalar) { + new (self) spin_op_term(spin_op_term() * scalar); + }, + "Creates a product operator with non-constant scalar value.") + .def(nanobind::init(), "Creates a product operator with the given elementary operator.") - .def(py::init(), py::arg("operator"), - py::arg("size") = 0, + .def(nanobind::init(), + nanobind::arg("operator"), nanobind::arg("size") = 0, "Creates a copy of the given operator and reserves space for " "storing the given " "number of product terms (if a size is provided).") .def_static( "from_json", [](const std::string &json_str) { - py::object json = py::module_::import("json"); - auto data = py::list(json.attr("loads")(json_str)); - spin_op op(data.cast>()); + nanobind::object json = nanobind::module_::import_("json"); + auto data = nanobind::list(json.attr("loads")(json_str)); + spin_op op(nanobind::cast>(data)); if (op.num_terms() != 1) throw std::runtime_error( "invalid data representation for product operator"); @@ -719,7 +750,7 @@ void bindSpinOperator(py::module &mod) { // evaluations .def("evaluate_coefficient", &spin_op_term::evaluate_coefficient, - py::arg("parameters") = parameter_map(), + nanobind::arg("parameters") = parameter_map(), "Returns the evaluated coefficient of the product operator. 
The " "parameters is a map of parameter names to their concrete, complex " "values.") @@ -730,9 +761,9 @@ void bindSpinOperator(py::module &mod) { auto cmat = self.to_matrix(dimensions, params, invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -742,13 +773,13 @@ void bindSpinOperator(py::module &mod) { .def( "to_matrix", [](const spin_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { auto cmat = self.to_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); return details::cmat_to_numpy(cmat); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Returns the matrix representation of the operator." "The matrix is ordered according to the convention (endianness) " "used in CUDA-Q, and the ordering returned by `degrees`. This order " @@ -761,9 +792,9 @@ void bindSpinOperator(py::module &mod) { const parameter_map ¶ms, bool invert_order) { return self.to_sparse_matrix(dimensions, params, invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("parameters") = parameter_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("parameters") = parameter_map(), + nanobind::arg("invert_order") = false, "Return the sparse matrix representation of the operator. 
This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -777,12 +808,12 @@ void bindSpinOperator(py::module &mod) { .def( "to_sparse_matrix", [](const spin_op_term &self, dimension_map &dimensions, - bool invert_order, const py::kwargs &kwargs) { + bool invert_order, const nanobind::kwargs &kwargs) { return self.to_sparse_matrix( dimensions, details::kwargs_to_param_map(kwargs), invert_order); }, - py::arg("dimensions") = dimension_map(), - py::arg("invert_order") = false, + nanobind::arg("dimensions") = dimension_map(), + nanobind::arg("invert_order") = false, nanobind::arg("kwargs"), "Return the sparse matrix representation of the operator. This " "representation is a " "`Tuple[list[complex], list[int], list[int]]`, encoding the " @@ -796,7 +827,7 @@ void bindSpinOperator(py::module &mod) { // comparisons - .def("__eq__", &spin_op_term::operator==, py::is_operator(), + .def("__eq__", &spin_op_term::operator==, nanobind::is_operator(), "Return true if the two operators are equivalent. The equivalence " "check takes " "commutation relations into account. 
Operators acting on different " @@ -808,77 +839,78 @@ void bindSpinOperator(py::module &mod) { [](const spin_op_term &self, const spin_op &other) { return other.num_terms() == 1 && *other.begin() == self; }, - py::is_operator(), "Return true if the two operators are equivalent.") + nanobind::is_operator(), + "Return true if the two operators are equivalent.") // unary operators - .def(-py::self, py::is_operator()) - .def(+py::self, py::is_operator()) + .def(-nanobind::self, nanobind::is_operator()) + .def(+nanobind::self, nanobind::is_operator()) // in-place arithmetics - .def(py::self /= int(), py::is_operator()) - .def(py::self *= int(), py::is_operator()) - .def(py::self /= double(), py::is_operator()) - .def(py::self *= double(), py::is_operator()) - .def(py::self /= std::complex(), py::is_operator()) - .def(py::self *= std::complex(), py::is_operator()) - .def(py::self /= scalar_operator(), py::is_operator()) - .def(py::self *= scalar_operator(), py::is_operator()) - .def(py::self *= py::self, py::is_operator()) + .def(nanobind::self /= int(), nanobind::is_operator()) + .def(nanobind::self *= int(), nanobind::is_operator()) + .def(nanobind::self /= double(), nanobind::is_operator()) + .def(nanobind::self *= double(), nanobind::is_operator()) + .def(nanobind::self /= std::complex(), nanobind::is_operator()) + .def(nanobind::self *= std::complex(), nanobind::is_operator()) + .def(nanobind::self /= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= scalar_operator(), nanobind::is_operator()) + .def(nanobind::self *= nanobind::self, nanobind::is_operator()) // right-hand arithmetics - .def(py::self / int(), py::is_operator()) - .def(py::self * int(), py::is_operator()) - .def(py::self + int(), py::is_operator()) - .def(py::self - int(), py::is_operator()) - .def(py::self / double(), py::is_operator()) - .def(py::self * double(), py::is_operator()) - .def(py::self + double(), py::is_operator()) - .def(py::self - double(), py::is_operator()) - 
.def(py::self / std::complex(), py::is_operator()) - .def(py::self * std::complex(), py::is_operator()) - .def(py::self + std::complex(), py::is_operator()) - .def(py::self - std::complex(), py::is_operator()) - .def(py::self / scalar_operator(), py::is_operator()) - .def(py::self * scalar_operator(), py::is_operator()) - .def(py::self + scalar_operator(), py::is_operator()) - .def(py::self - scalar_operator(), py::is_operator()) - .def(py::self * py::self, py::is_operator()) - .def(py::self + py::self, py::is_operator()) - .def(py::self - py::self, py::is_operator()) - .def(py::self * spin_op(), py::is_operator()) - .def(py::self + spin_op(), py::is_operator()) - .def(py::self - spin_op(), py::is_operator()) - .def(py::self * matrix_op_term(), py::is_operator()) - .def(py::self + matrix_op_term(), py::is_operator()) - .def(py::self - matrix_op_term(), py::is_operator()) - .def(py::self * matrix_op(), py::is_operator()) - .def(py::self + matrix_op(), py::is_operator()) - .def(py::self - matrix_op(), py::is_operator()) + .def(nanobind::self / int(), nanobind::is_operator()) + .def(nanobind::self * int(), nanobind::is_operator()) + .def(nanobind::self + int(), nanobind::is_operator()) + .def(nanobind::self - int(), nanobind::is_operator()) + .def(nanobind::self / double(), nanobind::is_operator()) + .def(nanobind::self * double(), nanobind::is_operator()) + .def(nanobind::self + double(), nanobind::is_operator()) + .def(nanobind::self - double(), nanobind::is_operator()) + .def(nanobind::self / std::complex(), nanobind::is_operator()) + .def(nanobind::self * std::complex(), nanobind::is_operator()) + .def(nanobind::self + std::complex(), nanobind::is_operator()) + .def(nanobind::self - std::complex(), nanobind::is_operator()) + .def(nanobind::self / scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * scalar_operator(), nanobind::is_operator()) + .def(nanobind::self + scalar_operator(), nanobind::is_operator()) + .def(nanobind::self - 
scalar_operator(), nanobind::is_operator()) + .def(nanobind::self * nanobind::self, nanobind::is_operator()) + .def(nanobind::self + nanobind::self, nanobind::is_operator()) + .def(nanobind::self - nanobind::self, nanobind::is_operator()) + .def(nanobind::self * spin_op(), nanobind::is_operator()) + .def(nanobind::self + spin_op(), nanobind::is_operator()) + .def(nanobind::self - spin_op(), nanobind::is_operator()) + .def(nanobind::self * matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self + matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self - matrix_op_term(), nanobind::is_operator()) + .def(nanobind::self * matrix_op(), nanobind::is_operator()) + .def(nanobind::self + matrix_op(), nanobind::is_operator()) + .def(nanobind::self - matrix_op(), nanobind::is_operator()) // left-hand arithmetics - .def(int() * py::self, py::is_operator()) - .def(int() + py::self, py::is_operator()) - .def(int() - py::self, py::is_operator()) - .def(double() * py::self, py::is_operator()) - .def(double() + py::self, py::is_operator()) - .def(double() - py::self, py::is_operator()) - .def(std::complex() * py::self, py::is_operator()) - .def(std::complex() + py::self, py::is_operator()) - .def(std::complex() - py::self, py::is_operator()) - .def(scalar_operator() * py::self, py::is_operator()) - .def(scalar_operator() + py::self, py::is_operator()) - .def(scalar_operator() - py::self, py::is_operator()) + .def(int() * nanobind::self, nanobind::is_operator()) + .def(int() + nanobind::self, nanobind::is_operator()) + .def(int() - nanobind::self, nanobind::is_operator()) + .def(double() * nanobind::self, nanobind::is_operator()) + .def(double() + nanobind::self, nanobind::is_operator()) + .def(double() - nanobind::self, nanobind::is_operator()) + .def(std::complex() * nanobind::self, nanobind::is_operator()) + .def(std::complex() + nanobind::self, nanobind::is_operator()) + .def(std::complex() - nanobind::self, nanobind::is_operator()) + .def(scalar_operator() * 
nanobind::self, nanobind::is_operator()) + .def(scalar_operator() + nanobind::self, nanobind::is_operator()) + .def(scalar_operator() - nanobind::self, nanobind::is_operator()) // general utility functions .def("is_identity", &spin_op_term::is_identity, "Checks if all operators in the product are the identity. " - "Note: this function returns true regardless of the value of the " - "coefficient.") + "Note that this function returns true regardless of the value of " + "the coefficient.") .def( "__str__", [](const spin_op_term &self) { return self.to_string(); }, "Returns the string representation of the operator.") @@ -896,18 +928,18 @@ void bindSpinOperator(py::module &mod) { .def( "to_json", [](const spin_op_term &self) { - py::object json = py::module_::import("json"); + nanobind::object json = nanobind::module_::import_("json"); auto data = spin_op(self).get_data_representation(); return json.attr("dumps")(data); }, - "Convert spin_op to JSON string: '[d1, d2, d3, ...]'") + "Convert spin_op to a JSON string, e.g., '[d1, d2, d3, ...]'.") // only exists for spin operators .def( "get_pauli_word", [](spin_op_term &op, std::size_t pad_identities) { return op.get_pauli_word(pad_identities); }, - py::arg("pad_identities") = 0, + nanobind::arg("pad_identities") = 0, "Gets the Pauli word representation of this product operator.") // only exists for spin operators .def("get_binary_symplectic_form", @@ -973,7 +1005,7 @@ void bindSpinOperator(py::module &mod) { 1); return self.to_string(print_coefficient); }, - py::arg("print_coefficient") = true, + nanobind::arg("print_coefficient") = true, "Deprecated - use the standard `str` conversion or use " "`get_pauli_word` instead.") .def( @@ -985,18 +1017,19 @@ void bindSpinOperator(py::module &mod) { 1); return spin_op(op).distribute_terms(chunks); }, - py::arg("chunk_count"), + nanobind::arg("chunk_count"), "Deprecated - instantiate a `SpinOperator` from this " "`SpinOperatorTerm` " "and call distribute_terms on that.") .def( 
"for_each_pauli", - [](spin_op_term &self, py::function functor) { + [](spin_op_term &self, nanobind::callable functor) { PyErr_WarnEx(PyExc_DeprecationWarning, "use standard iteration instead", 1); spin_op(self).for_each_pauli(functor); }, - py::arg("function"), "Deprecated - use standard iteration instead."); + nanobind::arg("function"), + "Deprecated - use standard iteration instead."); #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)) #pragma GCC diagnostic pop #endif @@ -1005,12 +1038,12 @@ void bindSpinOperator(py::module &mod) { #endif } -void bindSpinWrapper(py::module &mod) { +void bindSpinWrapper(nanobind::module_ &mod) { bindSpinOperator(mod); - py::implicitly_convertible(); - py::implicitly_convertible, spin_op_term>(); - py::implicitly_convertible(); - py::implicitly_convertible(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible, spin_op_term>(); + nanobind::implicitly_convertible(); + nanobind::implicitly_convertible(); bindSpinModule(mod); } diff --git a/python/runtime/cudaq/operators/py_spin_op.h b/python/runtime/cudaq/operators/py_spin_op.h index 592458ca681..3d0b7df7a8b 100644 --- a/python/runtime/cudaq/operators/py_spin_op.h +++ b/python/runtime/cudaq/operators/py_spin_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of `cudaq::spin` /// and `cudaq::spin_op` to python. 
-void bindSpinWrapper(py::module &mod); +void bindSpinWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_super_op.cpp b/python/runtime/cudaq/operators/py_super_op.cpp index 730064dbb72..2c18dfbc820 100644 --- a/python/runtime/cudaq/operators/py_super_op.cpp +++ b/python/runtime/cudaq/operators/py_super_op.cpp @@ -7,10 +7,14 @@ ******************************************************************************/ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cudaq/operators.h" #include "py_helpers.h" @@ -18,53 +22,54 @@ namespace cudaq { -void bindSuperOperatorWrapper(py::module &mod) { - auto super_op_class = py::class_(mod, "SuperOperator"); +void bindSuperOperatorWrapper(nanobind::module_ &mod) { + auto super_op_class = nanobind::class_(mod, "SuperOperator"); super_op_class - .def(py::init<>(), "Creates a default instantiated super-operator. A " - "default instantiated " - "super-operator means a no action linear map.") - .def_static( - "left_multiply", - py::overload_cast &>( - &super_op::left_multiply), - "Creates a super-operator representing a left " - "multiplication of the operator to the density matrix.") - .def_static( - "right_multiply", - py::overload_cast &>( - &super_op::right_multiply), - "Creates a super-operator representing a right " - "multiplication of the operator to the density matrix.") - .def_static( - "left_right_multiply", - py::overload_cast &, - const cudaq::product_op &>( - &super_op::left_right_multiply), - "Creates a super-operator representing a simultaneous left " - "multiplication of the first operator operand and right " - "multiplication of the second operator operand to the " - "density matrix.") + .def(nanobind::init<>(), + "Creates a default instantiated super-operator. 
A " + "default instantiated " + "super-operator means a no action linear map.") + .def_static("left_multiply", + nanobind::overload_cast< + const cudaq::product_op &>( + &super_op::left_multiply), + "Creates a super-operator representing a left " + "multiplication of the operator to the density matrix.") + .def_static("right_multiply", + nanobind::overload_cast< + const cudaq::product_op &>( + &super_op::right_multiply), + "Creates a super-operator representing a right " + "multiplication of the operator to the density matrix.") + .def_static("left_right_multiply", + nanobind::overload_cast< + const cudaq::product_op &, + const cudaq::product_op &>( + &super_op::left_right_multiply), + "Creates a super-operator representing a simultaneous left " + "multiplication of the first operator operand and right " + "multiplication of the second operator operand to the " + "density matrix.") .def_static( "left_multiply", - py::overload_cast &>( + nanobind::overload_cast &>( &super_op::left_multiply), "Creates a super-operator representing a left " "multiplication of the operator to the density matrix. The sum is " "distributed into a linear combination of super-operator actions.") .def_static( "right_multiply", - py::overload_cast &>( + nanobind::overload_cast &>( &super_op::right_multiply), "Creates a super-operator representing a right " "multiplication of the operator to the density matrix. 
The sum is " "distributed into a linear combination of super-operator actions.") .def_static( "left_right_multiply", - py::overload_cast &, - const cudaq::sum_op &>( + nanobind::overload_cast &, + const cudaq::sum_op &>( &super_op::left_right_multiply), "Creates a super-operator representing a simultaneous left " "multiplication of the first operator operand and right " @@ -74,11 +79,13 @@ void bindSuperOperatorWrapper(py::module &mod) { .def( "__iter__", [](super_op &self) { - return py::make_iterator(self.begin(), self.end()); + return nanobind::make_iterator(nanobind::type(), + "iterator", self.begin(), + self.end()); }, - py::keep_alive<0, 1>(), + nanobind::keep_alive<0, 1>(), "Loop through each term of the super-operator.") - .def(py::self += py::self, py::is_operator()); + .def(nanobind::self += nanobind::self, nanobind::is_operator()); } } // namespace cudaq diff --git a/python/runtime/cudaq/operators/py_super_op.h b/python/runtime/cudaq/operators/py_super_op.h index 32474d65639..da8c5e3ea3a 100644 --- a/python/runtime/cudaq/operators/py_super_op.h +++ b/python/runtime/cudaq/operators/py_super_op.h @@ -6,12 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Wrapper function for exposing the bindings of super-operator to /// python. 
-void bindSuperOperatorWrapper(py::module &mod); +void bindSuperOperatorWrapper(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 9cf74e898c0..dd5b60c6823 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -9,7 +9,6 @@ #include "py_alt_launch_kernel.h" #include "common/AnalogHamiltonian.h" #include "common/ArgumentWrapper.h" -#include "common/CompiledModule.h" #include "common/Environment.h" #include "cudaq/Optimizer/Builder/Marshal.h" #include "cudaq/Optimizer/Builder/Runtime.h" @@ -24,6 +23,7 @@ #include "cudaq_internal/compiler/LayoutInfo.h" #include "runtime/cudaq/algorithms/py_utils.h" #include "utils/LinkedLibraryHolder.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" #include "utils/PyTypes.h" #include "llvm/MC/SubtargetFeature.h" @@ -31,7 +31,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" #include "mlir/CAPI/ExecutionEngine.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/ExecutionEngine/OptUtils.h" @@ -42,10 +41,16 @@ #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Transforms/Passes.h" #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -namespace py = pybind11; using namespace mlir; using namespace cudaq_internal::compiler; using cudaq::JitEngine; @@ -128,45 +133,46 @@ void cudaq::setDataLayout(MlirModule module) { // The section is the implementation of functions declared in OpaqueArguments.h //===----------------------------------------------------------------------===// -py::args cudaq::simplifiedValidateInputArguments(py::args &args) { - py::args processed = py::tuple(args.size()); +nanobind::args 
cudaq::simplifiedValidateInputArguments(nanobind::args &args) { + nanobind::args processed = + nanobind::steal(PyTuple_New((Py_ssize_t)args.size())); for (std::size_t i = 0; i < args.size(); ++i) { - auto arg = args[i]; + nanobind::object arg = nanobind::borrow(args[i]); // Check if it has tolist, so it might be a 1d buffer (array / numpy // ndarray) - if (py::hasattr(args[i], "tolist")) { + if (nanobind::hasattr(args[i], "tolist")) { // This is a valid ndarray if it has tolist and shape - if (!py::hasattr(args[i], "shape")) + if (!nanobind::hasattr(args[i], "shape")) throw std::runtime_error( "Invalid input argument type, could not get shape of array."); // This is an ndarray with tolist() and shape attributes // get the shape and check its size - auto shape = args[i].attr("shape").cast(); + auto shape = nanobind::cast(args[i].attr("shape")); if (shape.size() != 1) throw std::runtime_error("Cannot pass ndarray with shape != (N,)."); arg = args[i].attr("tolist")(); - } else if (py::isinstance(arg)) { - arg = py::cast(arg); - } else if (py::isinstance(arg)) { - py::list arg_list = py::cast(arg); + } else if (nanobind::isinstance(arg)) { + arg = nanobind::cast(nanobind::cast(arg)); + } else if (nanobind::isinstance(arg)) { + nanobind::list arg_list = nanobind::cast(arg); const bool all_strings = [&]() { - for (auto &item : arg_list) - if (!py::isinstance(item)) + for (auto item : arg_list) + if (!nanobind::isinstance(item)) return false; return true; }(); if (all_strings) { std::vector pw_list; pw_list.reserve(arg_list.size()); - for (auto &item : arg_list) - pw_list.emplace_back(py::cast(item)); - arg = std::move(pw_list); + for (auto item : arg_list) + pw_list.emplace_back(nanobind::cast(item)); + arg = nanobind::cast(std::move(pw_list)); } } - processed[i] = arg; + PyTuple_SET_ITEM(processed.ptr(), (Py_ssize_t)i, arg.inc_ref().ptr()); } return processed; @@ -174,7 +180,7 @@ py::args cudaq::simplifiedValidateInputArguments(py::args &args) { void 
cudaq::handleStructMemberVariable(void *data, std::size_t offset, mlir::Type memberType, - py::object value) { + nanobind::object value) { auto appendValue = [](void *data, auto &&value, std::size_t offset) { std::memcpy(((char *)data) + offset, &value, sizeof(std::remove_cvref_t)); @@ -182,22 +188,23 @@ void cudaq::handleStructMemberVariable(void *data, std::size_t offset, llvm::TypeSwitch(memberType) .Case([&](mlir::IntegerType ty) { if (ty.isInteger(1)) { - appendValue(data, (bool)value.cast(), offset); + appendValue(data, nanobind::cast(value), offset); return; } - appendValue(data, (std::int64_t)value.cast(), offset); + appendValue(data, nanobind::cast(value), offset); }) .Case([&](mlir::Float64Type ty) { - appendValue(data, (double)value.cast(), offset); + appendValue(data, nanobind::cast(value), offset); }) .Case([&](cudaq::cc::StdvecType ty) { - auto appendVectorValue = [](py::object value, void *data, - std::size_t offset, T) { - auto asList = value.cast(); + auto appendVectorValue = [](nanobind::object value, + void *data, std::size_t offset, + T) { + auto asList = nanobind::cast(value); // Use the correct element type T (not always double). 
auto *values = new std::vector(asList.size()); - for (std::size_t i = 0; auto &v : asList) - (*values)[i++] = v.cast(); + for (std::size_t i = 0; auto v : asList) + (*values)[i++] = nanobind::cast(v); std::memcpy(((char *)data) + offset, values, 16); }; @@ -225,10 +232,11 @@ void cudaq::handleStructMemberVariable(void *data, std::size_t offset, }); } -void *cudaq::handleVectorElements(mlir::Type eleTy, py::list list) { - auto appendValue = [](py::list list, auto &&converter) -> void * { +void *cudaq::handleVectorElements(mlir::Type eleTy, nanobind::list list) { + auto appendValue = [](nanobind::list list, + auto &&converter) -> void * { std::vector *values = new std::vector(list.size()); - for (std::size_t i = 0; auto &v : list) { + for (std::size_t i = 0; auto v : list) { auto converted = converter(v, i); (*values)[i++] = converted; } @@ -239,70 +247,70 @@ void *cudaq::handleVectorElements(mlir::Type eleTy, py::list list) { .Case([&](mlir::IntegerType ty) { if (ty.getIntOrFloatBitWidth() == 1) return appendValue.template operator()( - list, [](py::handle v, std::size_t i) { - checkListElementType(v, i); - return v.cast(); + list, [](nanobind::handle v, std::size_t i) { + checkListElementType(v, i); + return nanobind::cast(v); }); if (ty.getIntOrFloatBitWidth() == 8) return appendValue.template operator()( - list, [](py::handle v, std::size_t i) { + list, [](nanobind::handle v, std::size_t i) { checkListElementType(v, i); - return v.cast(); + return nanobind::cast(v); }); if (ty.getIntOrFloatBitWidth() == 16) return appendValue.template operator()( - list, [](py::handle v, std::size_t i) { + list, [](nanobind::handle v, std::size_t i) { checkListElementType(v, i); - return v.cast(); + return nanobind::cast(v); }); if (ty.getIntOrFloatBitWidth() == 32) return appendValue.template operator()( - list, [](py::handle v, std::size_t i) { + list, [](nanobind::handle v, std::size_t i) { checkListElementType(v, i); - return v.cast(); + return nanobind::cast(v); }); return 
appendValue.template operator()( - list, [](py::handle v, std::size_t i) { + list, [](nanobind::handle v, std::size_t i) { checkListElementType(v, i); - return v.cast(); + return nanobind::cast(v); }); }) .Case([&](mlir::Float32Type ty) { return appendValue.template operator()( - list, [](py::handle v, std::size_t i) { + list, [](nanobind::handle v, std::size_t i) { checkListElementType(v, i); - return v.cast(); + return nanobind::cast(v); }); }) .Case([&](mlir::Float64Type ty) { return appendValue.template operator()( - list, [](py::handle v, std::size_t i) { + list, [](nanobind::handle v, std::size_t i) { checkListElementType(v, i); - return v.cast(); + return nanobind::cast(v); }); }) .Case([&](cudaq::cc::CharspanType type) { return appendValue.template operator()( - list, [](py::handle v, std::size_t i) { - return v.cast().str(); + list, [](nanobind::handle v, std::size_t i) { + return nanobind::cast(v).str(); }); }) .Case([&](mlir::ComplexType type) { if (mlir::isa(type.getElementType())) return appendValue.template operator()>( - list, [](py::handle v, std::size_t i) { + list, [](nanobind::handle v, std::size_t i) { checkListElementType(v, i); - return v.cast>(); + return nanobind::cast>(v); }); return appendValue.template operator()>( - list, [](py::handle v, std::size_t i) { + list, [](nanobind::handle v, std::size_t i) { checkListElementType(v, i); - return v.cast>(); + return nanobind::cast>(v); }); }) .Case([&](cudaq::cc::StdvecType ty) { auto appendVectorValue = [](mlir::Type eleTy, - py::list list) -> void * { + nanobind::list list) -> void * { auto *values = new std::vector>(); for (std::size_t i = 0; i < list.size(); i++) { auto ptr = handleVectorElements(eleTy, list[i]); @@ -336,16 +344,18 @@ std::string cudaq::mlirTypeToString(mlir::Type ty) { return msg; } -void cudaq::packArgs(OpaqueArguments &argData, py::list args, - mlir::ArrayRef mlirTys, - const std::function &backupHandler, - mlir::func::FuncOp kernelFuncOp) { +void cudaq::packArgs( + 
OpaqueArguments &argData, nanobind::list args, + mlir::ArrayRef mlirTys, + const std::function + &backupHandler, + mlir::func::FuncOp kernelFuncOp) { if (args.size() == 0) return; for (auto [i, zippy] : llvm::enumerate(llvm::zip(args, mlirTys))) { - py::object arg = py::reinterpret_borrow(std::get<0>(zippy)); + nanobind::object arg = + nanobind::borrow(std::get<0>(zippy)); Type kernelArgTy = std::get<1>(zippy); if (arg.is_none()) { argData.emplace_back(nullptr, [](void *ptr) {}); @@ -355,39 +365,41 @@ void cudaq::packArgs(OpaqueArguments &argData, py::list args, .Case([&](ComplexType ty) { checkArgumentType(arg, i); if (isa(ty.getElementType())) { - addArgument(argData, arg.cast>()); + addArgument(argData, nanobind::cast>(arg)); } else if (isa(ty.getElementType())) { - addArgument(argData, arg.cast>()); + addArgument(argData, nanobind::cast>(arg)); } else { - throw std::runtime_error("Invalid complex type argument: " + - py::str(args).cast() + - " Type: " + mlirTypeToString(ty)); + throw std::runtime_error( + "Invalid complex type argument: " + + nanobind::cast( + nanobind::steal(PyObject_Str(args.ptr()))) + + " Type: " + mlirTypeToString(ty)); } }) .Case([&](Float64Type ty) { checkArgumentType(arg, i); - addArgument(argData, arg.cast()); + addArgument(argData, nanobind::cast(arg)); }) .Case([&](Float32Type ty) { checkArgumentType(arg, i); - addArgument(argData, arg.cast()); + addArgument(argData, nanobind::cast(arg)); }) .Case([&](IntegerType ty) { if (ty.getIntOrFloatBitWidth() == 1) { - checkArgumentType(arg, i); - addArgument(argData, static_cast(arg.cast())); + checkArgumentType(arg, i); + addArgument(argData, static_cast(nanobind::cast(arg))); return; } checkArgumentType(arg, i); - addArgument(argData, arg.cast()); + addArgument(argData, nanobind::cast(arg)); }) .Case([&](cc::CharspanType ty) { - addArgument(argData, arg.cast().str()); + addArgument(argData, nanobind::cast(arg).str()); }) .Case([&](cc::PointerType ty) { if (isa(ty.getElementType())) { - auto 
*stateArg = arg.cast(); + auto *stateArg = nanobind::cast(arg); if (stateArg == nullptr) throw std::runtime_error("Null cudaq::state* argument passed."); @@ -413,9 +425,11 @@ void cudaq::packArgs(OpaqueArguments &argData, py::list args, [](void *ptr) { /* do nothing, we don't own the state */ }); } } else { - throw std::runtime_error("Invalid pointer type argument: " + - py::str(arg).cast() + - " Type: " + mlirTypeToString(ty)); + throw std::runtime_error( + "Invalid pointer type argument: " + + nanobind::cast( + nanobind::steal(PyObject_Str(arg.ptr()))) + + " Type: " + mlirTypeToString(ty)); } }) .Case([&](cc::StructType ty) { @@ -424,16 +438,17 @@ void cudaq::packArgs(OpaqueArguments &argData, py::list args, auto memberTys = ty.getMembers(); auto allocatedArg = std::malloc(size); if (ty.getName() == "tuple") { - auto elements = arg.cast(); + auto elements = nanobind::cast(arg); for (std::size_t i = 0; i < offsets.size(); i++) handleStructMemberVariable(allocatedArg, offsets[i], memberTys[i], elements[i]); } else { - py::dict attributes = arg.attr("__annotations__").cast(); + nanobind::dict attributes = + nanobind::cast(arg.attr("__annotations__")); for (std::size_t i = 0; const auto &[attr_name, unused] : attributes) { - py::object attr_value = - arg.attr(attr_name.cast().c_str()); + nanobind::object attr_value = + arg.attr(nanobind::cast(attr_name).c_str()); handleStructMemberVariable(allocatedArg, offsets[i], memberTys[i], attr_value); i++; @@ -443,15 +458,15 @@ void cudaq::packArgs(OpaqueArguments &argData, py::list args, }) .Case([&](cc::StdvecType ty) { auto appendVectorValue = [&argData](Type eleTy, - py::list list) { + nanobind::list list) { auto allocatedArg = handleVectorElements(eleTy, list); argData.emplace_back(allocatedArg, [](void *ptr) { delete static_cast *>(ptr); }); }; - checkArgumentType(arg, i); - auto list = py::cast(arg); + checkArgumentType(arg, i); + auto list = nanobind::cast(arg); auto eleTy = ty.getElementType(); if (eleTy.isInteger(1)) 
{ // Special case for a `std::vector`. @@ -463,14 +478,15 @@ void cudaq::packArgs(OpaqueArguments &argData, py::list args, }) .Case([&](cc::CallableType ty) { // arg must be a DecoratorCapture object. - checkArgumentType(arg, i); - if (py::hasattr(arg, "linkedKernel")) { - auto kernelName = arg.attr("linkedKernel").cast(); + checkArgumentType(arg, i); + if (nanobind::hasattr(arg, "linkedKernel")) { + auto kernelName = + nanobind::cast(arg.attr("linkedKernel")); // TODO: This is kinda yucky to have to remove because it's already // present kernelName.erase(0, strlen(cudaq::runtime::cudaqGenPrefixName)); auto kernelModule = - unwrap(arg.attr("qkeModule").cast()); + unwrap(nanobind::cast(arg.attr("qkeModule"))); OpaqueArguments resolvedArgs; argData.emplace_back( new runtime::CallableClosureArgument(kernelName, kernelModule, @@ -480,16 +496,18 @@ void cudaq::packArgs(OpaqueArguments &argData, py::list args, delete static_cast(that); }); } else { - py::object decorator = arg.attr("decorator"); - auto kernelName = decorator.attr("uniqName").cast(); + nanobind::object decorator = arg.attr("decorator"); + auto kernelName = + nanobind::cast(decorator.attr("uniqName")); auto kernelModule = - unwrap(decorator.attr("qkeModule").cast()); + unwrap(nanobind::cast(decorator.attr("qkeModule"))); auto calledFuncOp = kernelModule.lookupSymbol( cudaq::runtime::cudaqGenPrefixName + kernelName); - py::list arguments = arg.attr("resolved"); + nanobind::list arguments = arg.attr("resolved"); auto startLiftedArgs = [&]() -> std::optional { if (!arguments.empty()) - return decorator.attr("formal_arity")().cast(); + return nanobind::cast( + decorator.attr("formal_arity")()); return std::nullopt; }(); // build the recursive closure in a C++ object @@ -515,17 +533,20 @@ void cudaq::packArgs(OpaqueArguments &argData, py::list args, bool success = backupHandler(argData, arg, i); if (!success) throw std::runtime_error( - "Could not pack argument: " + py::str(arg).cast() + + "Could not pack 
argument: " + + nanobind::cast( + nanobind::steal(PyObject_Str(arg.ptr()))) + " Type: " + mlirTypeToString(ty)); }); } } -void cudaq::packArgs(OpaqueArguments &argData, py::args args, - mlir::func::FuncOp kernelFuncOp, - const std::function &backupHandler, - std::size_t startingArgIdx) { +void cudaq::packArgs( + OpaqueArguments &argData, nanobind::args args, + mlir::func::FuncOp kernelFuncOp, + const std::function + &backupHandler, + std::size_t startingArgIdx) { if (args.size() == 0) { // Nothing to pack. This may be a full QIR pre-compile, which is perfectly // legit. At any rate, there is nothing to pack so return. @@ -539,7 +560,7 @@ void cudaq::packArgs(OpaqueArguments &argData, py::args args, std::to_string(args.size()) + " arguments."); // Move the args to a list, lopping off startingArgIdx args from the front. - py::list pyList; + nanobind::list pyList; for (auto [i, h] : llvm::enumerate(args)) { if (i < startingArgIdx) continue; @@ -556,11 +577,11 @@ void cudaq::packArgs(OpaqueArguments &argData, py::args args, /// Mechanical merge of a callable argument (captured in a python decorator) /// when the call site is executed. static bool linkResolvedCallable(ModuleOp currMod, func::FuncOp entryPoint, - unsigned argPos, py::object arg) { - if (!py::hasattr(arg, "qkeModule")) + unsigned argPos, nanobind::object arg) { + if (!nanobind::hasattr(arg, "qkeModule")) return false; - auto uniqName = arg.attr("uniqName").cast(); - auto otherModule = arg.attr("qkeModule").cast(); + auto uniqName = nanobind::cast(arg.attr("uniqName")); + auto otherModule = nanobind::cast(arg.attr("qkeModule")); ModuleOp otherMod = unwrap(otherModule); std::string calleeName = cudaq::runtime::cudaqGenPrefixName + uniqName; auto callee = cudaq::getKernelFuncOp(otherModule, calleeName); @@ -586,7 +607,8 @@ static bool linkResolvedCallable(ModuleOp currMod, func::FuncOp entryPoint, /// @brief Create a new OpaqueArguments pointer and pack the python arguments /// in it. 
Clients must delete the memory. -cudaq::OpaqueArguments *cudaq::toOpaqueArgs(py::args &args, MlirModule mod, +cudaq::OpaqueArguments *cudaq::toOpaqueArgs(nanobind::args &args, + MlirModule mod, const std::string &name) { auto kernelFunc = getKernelFuncOp(mod, name); auto *argData = new cudaq::OpaqueArguments(); @@ -594,7 +616,7 @@ cudaq::OpaqueArguments *cudaq::toOpaqueArgs(py::args &args, MlirModule mod, setDataLayout(mod); cudaq::packArgs( *argData, args, kernelFunc, - [](OpaqueArguments &, py::object &, unsigned) { return false; }); + [](OpaqueArguments &, nanobind::object &, unsigned) { return false; }); return argData; } @@ -642,7 +664,7 @@ static void pyAltLaunchAnalogKernel(const std::string &name, } template -py::object readPyObject(Type ty, char *arg) { +nanobind::object readPyObject(Type ty, char *arg) { std::size_t bytes = cudaq::byteSize(ty); if (sizeof(T) != bytes) { ty.dump(); @@ -658,11 +680,11 @@ py::object readPyObject(Type ty, char *arg) { /// Convert bytes in buffer, \p data, which are the result of the kernel /// launched to python object. 
-py::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { +nanobind::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { auto isRunContext = module->hasAttr(runtime::enableCudaqRun); - return TypeSwitch(ty) - .Case([&](IntegerType ty) -> py::object { + return TypeSwitch(ty) + .Case([&](IntegerType ty) -> nanobind::object { if (ty.getIntOrFloatBitWidth() == 1) return readPyObject(ty, data); if (ty.getIntOrFloatBitWidth() == 8) @@ -673,28 +695,28 @@ py::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { return readPyObject(ty, data); return readPyObject(ty, data); }) - .Case([&](ComplexType ty) -> py::object { + .Case([&](ComplexType ty) -> nanobind::object { auto eleTy = ty.getElementType(); - return TypeSwitch(eleTy) - .Case([&](Float64Type eTy) -> py::object { + return TypeSwitch(eleTy) + .Case([&](Float64Type eTy) -> nanobind::object { return readPyObject>(ty, data); }) - .Case([&](Float32Type eTy) -> py::object { + .Case([&](Float32Type eTy) -> nanobind::object { return readPyObject>(ty, data); }) - .Default([](Type eTy) -> py::object { + .Default([](Type eTy) -> nanobind::object { eTy.dump(); throw std::runtime_error( "Unsupported float element type for complex type return."); }); }) - .Case([&](Float64Type ty) -> py::object { + .Case([&](Float64Type ty) -> nanobind::object { return readPyObject(ty, data); }) - .Case([&](Float32Type ty) -> py::object { + .Case([&](Float32Type ty) -> nanobind::object { return readPyObject(ty, data); }) - .Case([&](cudaq::cc::StdvecType ty) -> py::object { + .Case([&](cudaq::cc::StdvecType ty) -> nanobind::object { if (isRunContext) { // cudaq.run return. auto eleTy = ty.getElementType(); @@ -707,9 +729,9 @@ py::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { // `std::vector`. 
if (eleTy.getIntOrFloatBitWidth() == 1) { auto v = reinterpret_cast *>(data); - py::list list; + nanobind::list list; for (auto const bit : *v) - list.append(py::bool_(bit)); + list.append(nanobind::bool_(bit)); return list; } @@ -723,7 +745,7 @@ py::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { auto v = reinterpret_cast(data); // Read vector elements. - py::list list; + nanobind::list list; for (char *i = v->begin; i < v->end; i += eleByteSize) list.append(convertResult(module, eleTy, i)); return list; @@ -742,19 +764,19 @@ py::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { auto v = reinterpret_cast(data); // Read vector elements. - py::list list; + nanobind::list list; std::size_t byteLength = v->length * eleByteSize; for (std::size_t i = 0; i < byteLength; i += eleByteSize) list.append(convertResult(module, eleTy, v->data + i)); return list; }) - .Case([&](cudaq::cc::StructType ty) -> py::object { + .Case([&](cudaq::cc::StructType ty) -> nanobind::object { auto name = ty.getName().str(); // Handle tuples. if (name == "tuple") { auto [size, offsets] = getTargetLayout(module, ty); auto memberTys = ty.getMembers(); - py::list list; + nanobind::list list; for (std::size_t i = 0; i < offsets.size(); i++) { auto eleTy = memberTys[i]; if (!eleTy.isIntOrFloat()) { @@ -765,7 +787,7 @@ py::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { } list.append(convertResult(module, eleTy, data + offsets[i])); } - return py::tuple(list); + return nanobind::tuple(list); } // Handle data class objects. @@ -776,14 +798,14 @@ py::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { auto [cls, attributes] = DataClassRegistry::getClassAttributes(name); // Collect field names. 
- std::vector fieldNames; + std::vector fieldNames; for (const auto &[attr_name, unused] : attributes) - fieldNames.emplace_back(py::str(attr_name)); + fieldNames.emplace_back(nanobind::str(attr_name)); // Read field values and create the constructor `kwargs` auto [size, offsets] = getTargetLayout(module, ty); auto memberTys = ty.getMembers(); - py::dict kwargs; + nanobind::dict kwargs; for (std::size_t i = 0; i < offsets.size(); i++) { auto eleTy = memberTys[i]; if (!eleTy.isIntOrFloat()) { @@ -804,7 +826,7 @@ py::object cudaq::convertResult(ModuleOp module, Type ty, char *data) { // Create python object of class `cls` with the collected args. return cls(**kwargs); }) - .Default([](Type ty) -> py::object { + .Default([](Type ty) -> nanobind::object { ty.dump(); throw std::runtime_error("Unsupported return type."); }); @@ -828,22 +850,21 @@ cudaq::clean_launch_module(const std::string &name, ModuleOp mod, return pyLaunchModule(name, mod, rawArgs); } -cudaq::OpaqueArguments -cudaq::marshal_arguments_for_module_launch(ModuleOp mod, py::args runtimeArgs, - func::FuncOp kernelFunc) { +cudaq::OpaqueArguments cudaq::marshal_arguments_for_module_launch( + ModuleOp mod, nanobind::args runtimeArgs, func::FuncOp kernelFunc) { // Convert python arguments to opaque form. 
cudaq::OpaqueArguments args; cudaq::packArgs( args, runtimeArgs, kernelFunc, - [&](cudaq::OpaqueArguments &args, py::object &pyArg, unsigned pos) { + [&](cudaq::OpaqueArguments &args, nanobind::object &pyArg, unsigned pos) { return linkResolvedCallable(mod, kernelFunc, pos, pyArg); }); return args; } -py::object cudaq::marshal_and_launch_module(const std::string &name, - MlirModule module, - py::args runtimeArgs) { +nanobind::object cudaq::marshal_and_launch_module(const std::string &name, + MlirModule module, + nanobind::args runtimeArgs) { ScopedTraceWithContext("marshal_and_launch_module", name); auto kernelFunc = getKernelFuncOp(module, name); auto mod = unwrap(module); @@ -853,18 +874,15 @@ py::object cudaq::marshal_and_launch_module(const std::string &name, // FIXME: handle dynamic sized results! if (!retTy) - return py::none(); + return nanobind::none(); return cudaq::convertResult(mod, retTy, reinterpret_cast(args.getArgs().back())); } // Compile (specialize + JIT) the kernel module and return a CompiledModule. -// The returned instance owns the JIT engine and manages its lifetime using -// RAII. 
-static cudaq::CompiledModule marshal_and_retain_module(const std::string &name, - MlirModule module, - bool isEntryPoint, - py::args runtimeArgs) { +static cudaq::CompiledModule +marshal_and_retain_module(const std::string &name, MlirModule module, + bool isEntryPoint, nanobind::args runtimeArgs) { ScopedTraceWithContext("marshal_and_retain_module", name); auto kernelFunc = cudaq::getKernelFuncOp(module, name); @@ -881,10 +899,11 @@ static cudaq::CompiledModule marshal_and_retain_module(const std::string &name, return compiled; } -static MlirModule synthesizeKernel(py::object kernel, py::args runtimeArgs) { - auto module = kernel.attr("qkeModule").cast(); +static MlirModule synthesizeKernel(nanobind::object kernel, + nanobind::args runtimeArgs) { + auto module = nanobind::cast(kernel.attr("qkeModule")); auto mod = unwrap(module); - auto name = kernel.attr("uniqName").cast(); + auto name = nanobind::cast(kernel.attr("uniqName")); if (mod->hasAttr(cudaq::runtime::pythonUniqueAttrName)) { StringRef n = cast(mod->getAttr(cudaq::runtime::pythonUniqueAttrName)); @@ -893,9 +912,10 @@ static MlirModule synthesizeKernel(py::object kernel, py::args runtimeArgs) { auto kernelFuncOp = cudaq::getKernelFuncOp(module, name); cudaq::OpaqueArguments args; cudaq::setDataLayout(module); - cudaq::packArgs( - args, runtimeArgs, kernelFuncOp, - [](cudaq::OpaqueArguments &, py::object &, unsigned) { return false; }); + cudaq::packArgs(args, runtimeArgs, kernelFuncOp, + [](cudaq::OpaqueArguments &, nanobind::object &, unsigned) { + return false; + }); ScopedTraceWithContext(cudaq::TIMING_JIT, "synthesizeKernel", name); auto rawArgs = appendResultToArgsVector(args, {}, mod, name); @@ -1039,13 +1059,14 @@ static ModuleOp cleanLowerToCodegenKernel(ModuleOp mod, } static MlirModule lower_to_codegen(const std::string &kernelName, - MlirModule module, py::args runtimeArgs) { + MlirModule module, + nanobind::args runtimeArgs) { auto kernelFunc = cudaq::getKernelFuncOp(module, kernelName); 
cudaq::OpaqueArguments args; auto mod = unwrap(module); cudaq::packArgs( args, runtimeArgs, kernelFunc, - [&](cudaq::OpaqueArguments &args, py::object &pyArg, unsigned pos) { + [&](cudaq::OpaqueArguments &args, nanobind::object &pyArg, unsigned pos) { return linkResolvedCallable(mod, kernelFunc, pos, pyArg); }); return wrap(cleanLowerToCodegenKernel(mod, args)); @@ -1065,21 +1086,21 @@ static std::size_t get_launch_args_required(MlirModule module, return result; } -void cudaq::bindAltLaunchKernel(py::module &mod, +void cudaq::bindAltLaunchKernel(nanobind::module_ &mod, std::function &&getTL) { getTransportLayer = std::move(getTL); - py::class_(mod, "CompiledModule") - .def_property_readonly( + nanobind::class_(mod, "CompiledModule") + .def_prop_ro( "entry_point", [](const cudaq::CompiledModule &ck) { return reinterpret_cast( ck.getJit().getEntryPoint()); }, "The address of the JIT-compiled entry point.") - .def_property_readonly("is_fully_specialized", - &cudaq::CompiledModule::isFullySpecialized, - "Whether all arguments have been specialized."); + .def_prop_ro("is_fully_specialized", + &cudaq::CompiledModule::isFullySpecialized, + "Whether all arguments have been specialized."); mod.def("lower_to_codegen", lower_to_codegen, "Lower a kernel module to CC dialect. 
Never launches the kernel."); @@ -1100,9 +1121,9 @@ void cudaq::bindAltLaunchKernel(py::module &mod, mod.def( "storePointerToStateData", - [](const std::string &name, const std::string &hash, py::buffer data, - simulation_precision precision) { - auto ptr = data.request().ptr; + [](const std::string &name, const std::string &hash, + nanobind::ndarray<> data, simulation_precision precision) { + auto ptr = data.data(); stateStorage->insert({hash, PyStateVectorData{ptr, precision, name}}); }, "Store qalloc state initialization array data."); @@ -1124,8 +1145,9 @@ void cudaq::bindAltLaunchKernel(py::module &mod, mod.def( "storePointerToCudaqState", - [](const std::string &name, const std::string &hash, py::object data) { - auto state = data.cast(); + [](const std::string &name, const std::string &hash, + nanobind::object data) { + auto state = nanobind::cast(data); cudaqStateStorage->insert({hash, PyStateData{state, name}}); }, "Store qalloc state initialization states."); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.h b/python/runtime/cudaq/platform/py_alt_launch_kernel.h index dcf74abe027..8e1cc9a98cb 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.h +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.h @@ -10,17 +10,17 @@ #include "cudaq/Optimizer/Builder/Factory.h" #include "cudaq/algorithms/run.h" +#include "utils/NanobindAdaptors.h" #include "utils/OpaqueArguments.h" #include "utils/PyTypes.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" -#include -#include -#include +#include +#include +#include +#include +#include #include #include -namespace py = pybind11; - namespace cudaq { /// @brief Set current architecture's data layout attribute on a module. @@ -28,25 +28,28 @@ void setDataLayout(MlirModule module); /// @brief Create a new OpaqueArguments pointer and pack the /// python arguments in it. Clients must delete the memory. 
-OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod, +OpaqueArguments *toOpaqueArgs(nanobind::args &args, MlirModule mod, const std::string &name); // FIXME: Document! std::size_t byteSize(mlir::Type ty); /// @brief Convert raw return of kernel to python object. -py::object convertResult(mlir::ModuleOp module, mlir::Type ty, char *data); +nanobind::object convertResult(mlir::ModuleOp module, mlir::Type ty, + char *data); /// Create python bindings for C++ code in this compilation unit. -void bindAltLaunchKernel(py::module &mod, std::function &&); +void bindAltLaunchKernel(nanobind::module_ &mod, + std::function &&); /// Launch the kernel \p kernelName from module \p module. \p runtimeArgs are /// the python arguments to the kernel. Pre-condition: all arguments must be /// resolved at this `callsite` \e prior to launching this module. In particular /// this means \p module is ready for beta reduction of callables. The return /// type is obtained from the kernel's FuncOp. \p module must be modifiable. -py::object marshal_and_launch_module(const std::string &kernelName, - MlirModule module, py::args runtimeArgs); +nanobind::object marshal_and_launch_module(const std::string &kernelName, + MlirModule module, + nanobind::args runtimeArgs); /// Pure C++ code that launches a kernel. Argument marshaling and result /// unmarshalling is \e not performed. 
@@ -55,7 +58,8 @@ KernelThunkResultType clean_launch_module(const std::string &kernelName, OpaqueArguments &args); OpaqueArguments -marshal_arguments_for_module_launch(mlir::ModuleOp mod, py::args runtimeArgs, +marshal_arguments_for_module_launch(mlir::ModuleOp mod, + nanobind::args runtimeArgs, mlir::func::FuncOp kernelFunc); } // namespace cudaq diff --git a/python/runtime/cudaq/qis/py_execution_manager.cpp b/python/runtime/cudaq/qis/py_execution_manager.cpp index 467c0d2c36b..13cf01df6cb 100644 --- a/python/runtime/cudaq/qis/py_execution_manager.cpp +++ b/python/runtime/cudaq/qis/py_execution_manager.cpp @@ -8,14 +8,13 @@ #include "cudaq/qis/execution_manager.h" #include -#include -#include - -namespace py = pybind11; +#include +#include +#include namespace cudaq { -void bindExecutionManager(py::module &mod) { +void bindExecutionManager(nanobind::module_ &mod) { mod.def( "applyQuantumOperation", @@ -29,9 +28,9 @@ void bindExecutionManager(py::module &mod) { [](auto &&el) { return cudaq::QuditInfo(2, el); }); cudaq::getExecutionManager()->apply(name, params, c, t, isAdjoint, op); }, - py::arg("name"), py::arg("params"), py::arg("controls"), - py::arg("targets"), py::arg("isAdjoint") = false, - py::arg("op") = cudaq::spin_op::identity()); + nanobind::arg("name"), nanobind::arg("params"), nanobind::arg("controls"), + nanobind::arg("targets"), nanobind::arg("isAdjoint") = false, + nanobind::arg("op") = cudaq::spin_op::identity()); mod.def("startAdjointRegion", []() { cudaq::getExecutionManager()->startAdjointRegion(); }); @@ -50,6 +49,6 @@ void bindExecutionManager(py::module &mod) { return cudaq::getExecutionManager()->measure(cudaq::QuditInfo(2, id), regName); }, - py::arg("qubit"), py::arg("register_name") = ""); + nanobind::arg("qubit"), nanobind::arg("register_name") = ""); } } // namespace cudaq diff --git a/python/runtime/cudaq/qis/py_execution_manager.h b/python/runtime/cudaq/qis/py_execution_manager.h index d562fe25946..4893dff9f6b 100644 --- 
a/python/runtime/cudaq/qis/py_execution_manager.h +++ b/python/runtime/cudaq/qis/py_execution_manager.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindExecutionManager(py::module &mod); +void bindExecutionManager(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/qis/py_pauli_word.cpp b/python/runtime/cudaq/qis/py_pauli_word.cpp index 923732cfc27..c8388e9153e 100644 --- a/python/runtime/cudaq/qis/py_pauli_word.cpp +++ b/python/runtime/cudaq/qis/py_pauli_word.cpp @@ -8,16 +8,18 @@ #include "py_pauli_word.h" #include "cudaq/qis/pauli_word.h" -#include +#include +#include namespace cudaq { -void bindPauliWord(py::module &mod) { +void bindPauliWord(nanobind::module_ &mod) { - py::class_(mod, "pauli_word", - "The `pauli_word` is a thin wrapper on a Pauli tensor " - "product string, e.g. `XXYZ` on 4 qubits.") - .def(py::init<>()) - .def(py::init()); + nanobind::class_( + mod, "pauli_word", + "The `pauli_word` is a thin wrapper on a Pauli tensor " + "product string, e.g. `XXYZ` on 4 qubits.") + .def(nanobind::init<>()) + .def(nanobind::init()); } } // namespace cudaq diff --git a/python/runtime/cudaq/qis/py_pauli_word.h b/python/runtime/cudaq/qis/py_pauli_word.h index 5ff9c2581a8..fc48d8a6230 100644 --- a/python/runtime/cudaq/qis/py_pauli_word.h +++ b/python/runtime/cudaq/qis/py_pauli_word.h @@ -8,11 +8,9 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { /// @brief Bind the Quantum Instruction Set. 
-void bindPauliWord(py::module &mod); +void bindPauliWord(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/runtime/cudaq/target/py_runtime_target.cpp b/python/runtime/cudaq/target/py_runtime_target.cpp index 5f58c95d293..1eabed728b4 100644 --- a/python/runtime/cudaq/target/py_runtime_target.cpp +++ b/python/runtime/cudaq/target/py_runtime_target.cpp @@ -13,9 +13,11 @@ #include "cudaq/runtime/logger/logger.h" #include "cudaq/target_control.h" #include -#include -#include -#include +#include +#include +#include +#include +#include #include namespace { @@ -52,55 +54,55 @@ void onTargetChange(const cudaq::RuntimeTarget &newTarget) { namespace cudaq { std::map -parseTargetKwArgs(const py::kwargs &extraConfig) { +parseTargetKwArgs(const nanobind::kwargs &extraConfig) { if (extraConfig.contains("options")) throw std::runtime_error("The keyword `options` argument is not supported " "in cudaq.set_target(). Please use the keyword " "`option` in order to set the target options."); std::map config; - for (auto &[key, value] : extraConfig) { + for (auto [key, value] : extraConfig) { std::string strValue = ""; - if (py::isinstance(value)) - strValue = value.cast() ? "true" : "false"; - else if (py::isinstance(value)) - strValue = value.cast(); - else if (py::isinstance(value)) - strValue = std::to_string(value.cast()); + if (nanobind::isinstance(value)) + strValue = nanobind::cast(value) ? 
"true" : "false"; + else if (nanobind::isinstance(value)) + strValue = nanobind::cast(value); + else if (nanobind::isinstance(value)) + strValue = std::to_string(nanobind::cast(value)); else throw std::runtime_error( "QPU kwargs config value must be cast-able to a string."); // Ignore empty parameter values if (!strValue.empty()) - config.emplace(key.cast(), strValue); + config.emplace(nanobind::cast(key), strValue); } return config; } -void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder) { +void bindRuntimeTarget(nanobind::module_ &mod, LinkedLibraryHolder &holder) { - py::enum_( + nanobind::enum_( mod, "SimulationPrecision", "Enumeration describing the precision of the underlying simulation.") .value("fp32", simulation_precision::fp32) .value("fp64", simulation_precision::fp64); - py::class_( + nanobind::class_( mod, "Target", "The `cudaq.Target` represents the underlying infrastructure that " "CUDA-Q kernels will execute on. Instances of `cudaq.Target` describe " "what simulator they may leverage, the quantum_platform required for " "execution, and a description for the target.") - .def_readonly("name", &cudaq::RuntimeTarget::name, - "The name of the `cudaq.Target`.") - .def_readonly("simulator", &cudaq::RuntimeTarget::simulatorName, - "The name of the simulator this `cudaq.Target` leverages. " - "This will be empty for physical QPUs.") - .def_readonly("platform", &cudaq::RuntimeTarget::platformName, - "The name of the quantum_platform implementation this " - "`cudaq.Target` leverages.") - .def_readonly("description", &cudaq::RuntimeTarget::description, - "A string describing the features for this `cudaq.Target`.") + .def_ro("name", &cudaq::RuntimeTarget::name, + "The name of the `cudaq.Target`.") + .def_ro("simulator", &cudaq::RuntimeTarget::simulatorName, + "The name of the simulator this `cudaq.Target` leverages. 
" + "This will be empty for physical QPUs.") + .def_ro("platform", &cudaq::RuntimeTarget::platformName, + "The name of the quantum_platform implementation this " + "`cudaq.Target` leverages.") + .def_ro("description", &cudaq::RuntimeTarget::description, + "A string describing the features for this `cudaq.Target`.") .def( "num_qpus", [](cudaq::RuntimeTarget &_) { return cudaq::platform_num_qpus(); }, @@ -165,7 +167,7 @@ void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder) { "Return all available `cudaq.Target` instances on the current system."); mod.def( "set_target", - [&](const cudaq::RuntimeTarget &target, py::kwargs extraConfig) { + [&](const cudaq::RuntimeTarget &target, nanobind::kwargs extraConfig) { auto config = parseTargetKwArgs(extraConfig); holder.setTarget(target.name, config); onTargetChange(target); @@ -175,7 +177,7 @@ void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder) { "kwargs."); mod.def( "set_target", - [&](const std::string &name, py::kwargs extraConfig) { + [&](const std::string &name, nanobind::kwargs extraConfig) { auto config = parseTargetKwArgs(extraConfig); holder.setTarget(name, config); onTargetChange(holder.getTarget()); @@ -209,10 +211,12 @@ void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder) { }, "Unregister a callback identified by the input identifier."); - py::module_::import("atexit").attr("register")(py::cpp_function([]() { - // Perform cleanup of registered callbacks, which might be Python objects. - g_callbacks.clear(); - })); + nanobind::module_::import_("atexit").attr("register")( + nanobind::cpp_function([]() { + // Perform cleanup of registered callbacks, which might be Python + // objects. 
+ g_callbacks.clear(); + })); } } // namespace cudaq diff --git a/python/runtime/cudaq/target/py_runtime_target.h b/python/runtime/cudaq/target/py_runtime_target.h index 1d405033241..672ef6c298d 100644 --- a/python/runtime/cudaq/target/py_runtime_target.h +++ b/python/runtime/cudaq/target/py_runtime_target.h @@ -8,14 +8,12 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { class LinkedLibraryHolder; -void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder); +void bindRuntimeTarget(nanobind::module_ &mod, LinkedLibraryHolder &holder); } // namespace cudaq diff --git a/python/runtime/cudaq/target/py_testing_utils.cpp b/python/runtime/cudaq/target/py_testing_utils.cpp index 423677eebeb..ccfff9e2c7a 100644 --- a/python/runtime/cudaq/target/py_testing_utils.cpp +++ b/python/runtime/cudaq/target/py_testing_utils.cpp @@ -11,9 +11,9 @@ #include "cudaq.h" #include "cudaq/platform.h" #include "nvqir/CircuitSimulator.h" -#include -#include -namespace py = pybind11; +#include +#include +#include namespace nvqir { void toggleDynamicQubitManagement(); @@ -21,7 +21,7 @@ void toggleDynamicQubitManagement(); namespace cudaq { -void bindTestUtils(py::module &mod, LinkedLibraryHolder &holder) { +void bindTestUtils(nanobind::module_ &mod, LinkedLibraryHolder &holder) { auto testingSubmodule = mod.def_submodule("testing"); testingSubmodule.def( @@ -34,7 +34,7 @@ void bindTestUtils(py::module &mod, LinkedLibraryHolder &holder) { auto simName = holder.getTarget().simulatorName; return holder.getSimulator(simName)->allocateQubits(numQubits); }, - py::arg("numQubits")); + nanobind::arg("numQubits")); testingSubmodule.def("deallocateQubits", [&](const std::vector &qubits) { diff --git a/python/runtime/cudaq/target/py_testing_utils.h b/python/runtime/cudaq/target/py_testing_utils.h index deb53e59e8e..593022f95fd 100644 --- a/python/runtime/cudaq/target/py_testing_utils.h +++ b/python/runtime/cudaq/target/py_testing_utils.h @@ -8,15 +8,13 @@ #pragma 
once -#include - -namespace py = pybind11; +#include namespace cudaq { class LinkedLibraryHolder; /// @brief Bind test utilities needed for mock QPU QIR profile simulation -void bindTestUtils(py::module &mod, LinkedLibraryHolder &holder); +void bindTestUtils(nanobind::module_ &mod, LinkedLibraryHolder &holder); } // namespace cudaq diff --git a/python/runtime/interop/CMakeLists.txt b/python/runtime/interop/CMakeLists.txt index 5063c5858de..c20b2d8390a 100644 --- a/python/runtime/interop/CMakeLists.txt +++ b/python/runtime/interop/CMakeLists.txt @@ -10,10 +10,13 @@ add_compile_options(-Wno-attributes) add_library(cudaq-python-interop SHARED PythonCppInterop.cpp) target_include_directories(cudaq-python-interop PRIVATE ${PYTHON_INCLUDE_DIRS} - ${pybind11_INCLUDE_DIRS} ) -target_link_libraries(cudaq-python-interop PRIVATE pybind11::module cudaq) -install (FILES PythonCppInterop.h DESTINATION include/cudaq/python/) +if (SKBUILD) + target_link_libraries(cudaq-python-interop PRIVATE nanobind-static Python::Module cudaq) +else() + target_link_libraries(cudaq-python-interop PRIVATE nanobind-static Python::Python cudaq) +endif() +install (FILES PythonCppInterop.h PythonCppInteropDecls.h DESTINATION include/cudaq/python/) install(TARGETS cudaq-python-interop EXPORT cudaq-python-interop-targets DESTINATION lib) diff --git a/python/runtime/interop/PythonCppInterop.h b/python/runtime/interop/PythonCppInterop.h index 82f90eb73c7..9a24a740a7f 100644 --- a/python/runtime/interop/PythonCppInterop.h +++ b/python/runtime/interop/PythonCppInterop.h @@ -7,10 +7,10 @@ ******************************************************************************/ #pragma once +#include "PythonCppInteropDecls.h" #include "cudaq/qis/qkernel.h" -#include - -namespace py = pybind11; +#include +#include namespace cudaq::python { @@ -20,8 +20,8 @@ class CppPyKernelDecorator { /// The constructor. /// @param obj A kernel decorator Python object. 
/// @throw std::runtime_error if the object is not a valid kernel decorator. - CppPyKernelDecorator(py::object obj) : kernel(obj) { - if (!py::hasattr(obj, "qkeModule")) + CppPyKernelDecorator(nanobind::object obj) : kernel(obj) { + if (!nanobind::hasattr(obj, "qkeModule")) throw std::runtime_error("Invalid python kernel object passed, must be " "annotated with cudaq.kernel"); } @@ -52,9 +52,9 @@ class CppPyKernelDecorator { } private: - py::object kernel; + nanobind::object kernel; // Hold on to the CompiledModule, it keeps the JIT engine alive. - py::object compiledKernel; + nanobind::object compiledKernel; template void *getKernelHelper(bool isEntryPoint, As... as) { @@ -62,7 +62,7 @@ class CppPyKernelDecorator { compiledKernel = kernel.attr("beta_reduction")(isEntryPoint, std::forward(as)...); auto entryPointAddr = - compiledKernel.attr("entry_point").cast(); + nanobind::cast(compiledKernel.attr("entry_point")); // Set lsb to 1 to denote this is NOT a C++ kernel. auto *p = reinterpret_cast( static_cast(entryPointAddr) | 1); @@ -76,86 +76,13 @@ class CppPyKernelDecorator { /// (synthesized) into the kernel and cannot be changed by the algorithm. template requires QKernelType && std::invocable -auto launch_specialized_py_decorator(py::object qern, ALGO algo, As... as) { +auto launch_specialized_py_decorator(nanobind::object qern, ALGO algo, + As... as) { cudaq::python::CppPyKernelDecorator decorator(qern); auto entryPoint = decorator.getDirectKernelCall(std::forward(as)...); return algo(std::move(entryPoint)); } -/// @brief Extracts the kernel name from an input MLIR string. -/// @param input The input string containing the kernel name. -/// @return The extracted kernel name. -std::string getKernelName(const std::string &input); - -/// @brief Extracts a sub-string from an input string based on start and end -/// delimiters. -/// @param input The input string to extract from. -/// @param startStr The starting delimiter. -/// @param endStr The ending delimiter. 
-/// @return The extracted sub-string. -std::string extractSubstring(const std::string &input, - const std::string &startStr, - const std::string &endStr); - -/// @brief Retrieves the MLIR code and mangled kernel name for a given -/// user-level kernel name. -/// @param name The name of the kernel. -/// @return A tuple containing the MLIR code and the kernel name. -std::tuple -getMLIRCodeAndName(const std::string &name, const std::string mangled = ""); - -/// @brief Register a C++ device kernel with the given module and name -/// @param module The name of the module containing the kernel -/// @param name The name of the kernel to register -void registerDeviceKernel(const std::string &module, const std::string &name, - const std::string &mangled); - -/// @brief Retrieve the module and name of a registered device kernel -/// @param compositeName The composite name of the kernel (module.name) -/// @return A tuple containing the module name and kernel name -std::tuple -getDeviceKernel(const std::string &compositeName); - -bool isRegisteredDeviceModule(const std::string &compositeName); - -template -constexpr bool is_const_reference_v = - std::is_reference_v && std::is_const_v>; - -template -struct TypeMangler { - static std::string mangle() { - std::string mangledName = typeid(T).name(); - if constexpr (is_const_reference_v) { - mangledName = "RK" + mangledName; - } - return mangledName; - } -}; - -template -inline std::string getMangledArgsString() { - std::string result; - (result += ... += TypeMangler::mangle()); - - // Remove any namespace cudaq text - std::string search = "N5cudaq"; - std::string replace = ""; - - size_t pos = result.find(search); - while (pos != std::string::npos) { - result.replace(pos, search.length(), replace); - pos = result.find(search, pos + replace.length()); - } - - return result; -} - -template <> -inline std::string getMangledArgsString<>() { - return {}; -} - /// @brief Add a C++ device kernel that is usable from CUDA-Q Python. 
/// @tparam Signature The function signature of the kernel /// @param m The Python module to add the kernel to @@ -163,7 +90,7 @@ inline std::string getMangledArgsString<>() { /// @param kernelName The name of the kernel /// @param docstring The documentation string for the kernel template -void addDeviceKernelInterop(py::module_ &m, const std::string &modName, +void addDeviceKernelInterop(nanobind::module_ &m, const std::string &modName, const std::string &kernelName, const std::string &docstring) { @@ -171,16 +98,16 @@ void addDeviceKernelInterop(py::module_ &m, const std::string &modName, // FIXME Maybe Add replacement options (i.e., _pycudaq -> cudaq) - py::module_ sub; - if (py::hasattr(m, modName.c_str())) - sub = m.attr(modName.c_str()).cast(); - else - sub = m.def_submodule(modName.c_str()); + nanobind::module_ sub = + nanobind::hasattr(m, modName.c_str()) + ? nanobind::cast(m.attr(modName.c_str())) + : m.def_submodule(modName.c_str()); sub.def( kernelName.c_str(), [](Signature...) {}, docstring.c_str()); - cudaq::python::registerDeviceKernel(sub.attr("__name__").cast(), - kernelName, mangledArgs); + cudaq::python::registerDeviceKernel( + nanobind::cast(sub.attr("__name__")), kernelName, + mangledArgs); return; } } // namespace cudaq::python diff --git a/python/runtime/interop/PythonCppInteropDecls.h b/python/runtime/interop/PythonCppInteropDecls.h new file mode 100644 index 00000000000..9bb637807d1 --- /dev/null +++ b/python/runtime/interop/PythonCppInteropDecls.h @@ -0,0 +1,91 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace cudaq::python { + +/// @brief Extracts the kernel name from an input MLIR string. +/// @param input The input string containing the kernel name. +/// @return The extracted kernel name. +std::string getKernelName(const std::string &input); + +/// @brief Extracts a sub-string from an input string based on start and end +/// delimiters. +/// @param input The input string to extract from. +/// @param startStr The starting delimiter. +/// @param endStr The ending delimiter. +/// @return The extracted sub-string. +std::string extractSubstring(const std::string &input, + const std::string &startStr, + const std::string &endStr); + +/// @brief Retrieves the MLIR code and mangled kernel name for a given +/// user-level kernel name. +/// @param name The name of the kernel. +/// @return A tuple containing the MLIR code and the kernel name. +std::tuple +getMLIRCodeAndName(const std::string &name, const std::string mangled = ""); + +/// @brief Register a C++ device kernel with the given module and name +/// @param module The name of the module containing the kernel +/// @param name The name of the kernel to register +void registerDeviceKernel(const std::string &module, const std::string &name, + const std::string &mangled); + +/// @brief Retrieve the module and name of a registered device kernel +/// @param compositeName The composite name of the kernel (module.name) +/// @return A tuple containing the module name and kernel name +std::tuple +getDeviceKernel(const std::string &compositeName); + +bool isRegisteredDeviceModule(const std::string &compositeName); + +template +constexpr bool is_const_reference_v = + std::is_reference_v && std::is_const_v>; + +template +struct TypeMangler { + static std::string mangle() { + std::string mangledName = typeid(T).name(); + if constexpr (is_const_reference_v) { + mangledName = "RK" + 
mangledName; + } + return mangledName; + } +}; + +template +inline std::string getMangledArgsString() { + std::string result; + (result += ... += TypeMangler::mangle()); + + // Remove any namespace cudaq text + std::string search = "N5cudaq"; + std::string replace = ""; + + size_t pos = result.find(search); + while (pos != std::string::npos) { + result.replace(pos, search.length(), replace); + pos = result.find(search, pos + replace.length()); + } + + return result; +} + +template <> +inline std::string getMangledArgsString<>() { + return {}; +} + +} // namespace cudaq::python diff --git a/python/runtime/mlir/py_register_dialects.cpp b/python/runtime/mlir/py_register_dialects.cpp index f1ec32ac531..35f0b8cc217 100644 --- a/python/runtime/mlir/py_register_dialects.cpp +++ b/python/runtime/mlir/py_register_dialects.cpp @@ -16,20 +16,19 @@ #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h" #include "cudaq/Optimizer/InitAllPasses.h" #include "cudaq/Optimizer/Transforms/Passes.h" -#include "mlir/Bindings/Python/PybindAdaptors.h" +#include "utils/NanobindAdaptors.h" #include "mlir/InitAllDialects.h" #include -#include -#include +#include +#include +#include -namespace py = pybind11; -using namespace mlir::python::adaptors; using namespace mlir; namespace cudaq { static bool registered = false; -void registerQuakeDialectAndTypes(py::module &m) { +void registerQuakeDialectAndTypes(nanobind::module_ &m) { auto quakeMod = m.def_submodule("quake"); quakeMod.def( @@ -45,51 +44,52 @@ void registerQuakeDialectAndTypes(py::module &m) { registered = true; } }, - py::arg("load") = true, py::arg("context") = py::none()); + nanobind::arg("load") = true, + nanobind::arg("context") = nanobind::none()); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( quakeMod, "RefType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "get", - [](py::object cls, MlirContext context) { + [](nanobind::object cls, MlirContext context) { return 
wrap(quake::RefType::get(unwrap(context))); }, - py::arg("cls"), py::arg("context") = py::none()); + nanobind::arg("cls"), nanobind::arg("context") = nanobind::none()); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( quakeMod, "MeasureType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "get", - [](py::object cls, MlirContext context) { + [](nanobind::object cls, MlirContext context) { return wrap(quake::MeasureType::get(unwrap(context))); }, - py::arg("cls"), py::arg("context") = py::none()); + nanobind::arg("cls"), nanobind::arg("context") = nanobind::none()); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( quakeMod, "MeasurementsType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "get", - [](py::object cls, std::size_t size, MlirContext context) { + [](nanobind::object cls, std::size_t size, MlirContext context) { return wrap(quake::MeasurementsType::get(unwrap(context), size)); }, - py::arg("cls"), - py::arg("size") = quake::MeasurementsType::kDynamicSize, - py::arg("context") = py::none()); + nanobind::arg("cls"), + nanobind::arg("size") = quake::MeasurementsType::kDynamicSize, + nanobind::arg("context") = nanobind::none()); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( quakeMod, "VeqType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "get", - [](py::object cls, std::size_t size, MlirContext context) { + [](nanobind::object cls, std::size_t size, MlirContext context) { return wrap(quake::VeqType::get(unwrap(context), size)); }, - py::arg("cls"), - py::arg("size") = std::numeric_limits::max(), - py::arg("context") = py::none()) + nanobind::arg("cls"), + nanobind::arg("size") = std::numeric_limits::max(), + nanobind::arg("context") = nanobind::none()) .def_staticmethod( "hasSpecifiedSize", [](MlirType type) { @@ -100,7 +100,7 @@ void registerQuakeDialectAndTypes(py::module &m) { return 
veqTy.hasSpecifiedSize(); }, - py::arg("veqTypeInstance")) + nanobind::arg("veqTypeInstance")) .def_staticmethod( "getSize", [](MlirType type) { @@ -111,49 +111,51 @@ void registerQuakeDialectAndTypes(py::module &m) { return veqTy.getSize(); }, - py::arg("veqTypeInstance")); + nanobind::arg("veqTypeInstance")); quakeMod.def( "isConstantQuantumRefType", [](MlirType type) { return quake::isConstantQuantumRefType(unwrap(type)); }, - py::arg("type")); + nanobind::arg("type")); quakeMod.def( "getAllocationSize", [](MlirType type) { return quake::getAllocationSize(unwrap(type)); }, - py::arg("type")); + nanobind::arg("type")); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( quakeMod, "StruqType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "get", - [](py::object cls, py::list aggregateTypes, MlirContext context) { + [](nanobind::object cls, nanobind::list aggregateTypes, + MlirContext context) { SmallVector inTys; - for (auto &t : aggregateTypes) - inTys.push_back(unwrap(t.cast())); + for (auto t : aggregateTypes) + inTys.push_back(unwrap(nanobind::cast(t))); return wrap(quake::StruqType::get(unwrap(context), inTys)); }, - py::arg("cls"), py::arg("aggregateTypes"), - py::arg("context") = py::none()) + nanobind::arg("cls"), nanobind::arg("aggregateTypes"), + nanobind::arg("context") = nanobind::none()) .def_classmethod( "getNamed", - [](py::object cls, const std::string &name, py::list aggregateTypes, - MlirContext context) { + [](nanobind::object cls, const std::string &name, + nanobind::list aggregateTypes, MlirContext context) { SmallVector inTys; - for (auto &t : aggregateTypes) - inTys.push_back(unwrap(t.cast())); + for (auto t : aggregateTypes) + inTys.push_back(unwrap(nanobind::cast(t))); return wrap(quake::StruqType::get(unwrap(context), name, inTys)); }, - py::arg("cls"), py::arg("name"), py::arg("aggregateTypes"), - py::arg("context") = py::none()) + nanobind::arg("cls"), nanobind::arg("name"), + 
nanobind::arg("aggregateTypes"), + nanobind::arg("context") = nanobind::none()) .def_classmethod( "getTypes", - [](py::object cls, MlirType structTy) { + [](nanobind::object cls, MlirType structTy) { auto ty = dyn_cast(unwrap(structTy)); if (!ty) throw std::runtime_error( @@ -164,7 +166,7 @@ void registerQuakeDialectAndTypes(py::module &m) { ret.push_back(wrap(t)); return ret; }) - .def_classmethod("getName", [](py::object cls, MlirType structTy) { + .def_classmethod("getName", [](nanobind::object cls, MlirType structTy) { auto ty = dyn_cast(unwrap(structTy)); if (!ty) throw std::runtime_error( @@ -174,7 +176,7 @@ void registerQuakeDialectAndTypes(py::module &m) { }); } -void registerCCDialectAndTypes(py::module &m) { +void registerCCDialectAndTypes(nanobind::module_ &m) { auto ccMod = m.def_submodule("cc"); @@ -187,34 +189,35 @@ void registerCCDialectAndTypes(py::module &m) { mlirDialectHandleLoadDialect(ccHandle, context); } }, - py::arg("load") = true, py::arg("context") = py::none()); + nanobind::arg("load") = true, + nanobind::arg("context") = nanobind::none()); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( ccMod, "CharspanType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "get", - [](py::object cls, MlirContext context) { + [](nanobind::object cls, MlirContext context) { return wrap(cudaq::cc::CharspanType::get(unwrap(context))); }, - py::arg("cls"), py::arg("context") = py::none()); + nanobind::arg("cls"), nanobind::arg("context") = nanobind::none()); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( ccMod, "StateType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "get", - [](py::object cls, MlirContext context) { + [](nanobind::object cls, MlirContext context) { return wrap(quake::StateType::get(unwrap(context))); }, - py::arg("cls"), py::arg("context") = py::none()); + nanobind::arg("cls"), nanobind::arg("context") = nanobind::none()); - 
mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( ccMod, "PointerType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "getElementType", - [](py::object cls, MlirType type) { + [](nanobind::object cls, MlirType type) { auto ty = unwrap(type); auto casted = dyn_cast(ty); if (!casted) @@ -225,19 +228,19 @@ void registerCCDialectAndTypes(py::module &m) { }) .def_classmethod( "get", - [](py::object cls, MlirType elementType, MlirContext context) { + [](nanobind::object cls, MlirType elementType, MlirContext context) { return wrap(cudaq::cc::PointerType::get(unwrap(context), unwrap(elementType))); }, - py::arg("cls"), py::arg("elementType"), - py::arg("context") = py::none()); + nanobind::arg("cls"), nanobind::arg("elementType"), + nanobind::arg("context") = nanobind::none()); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( ccMod, "ArrayType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "getElementType", - [](py::object cls, MlirType type) { + [](nanobind::object cls, MlirType type) { auto ty = unwrap(type); auto casted = dyn_cast(ty); if (!casted) @@ -248,45 +251,47 @@ void registerCCDialectAndTypes(py::module &m) { }) .def_classmethod( "get", - [](py::object cls, MlirType elementType, std::int64_t size, + [](nanobind::object cls, MlirType elementType, std::int64_t size, MlirContext context) { return wrap(cudaq::cc::ArrayType::get(unwrap(context), unwrap(elementType), size)); }, - py::arg("cls"), py::arg("elementType"), - py::arg("size") = std::numeric_limits::min(), - py::arg("context") = py::none()); + nanobind::arg("cls"), nanobind::arg("elementType"), + nanobind::arg("size") = std::numeric_limits::min(), + nanobind::arg("context") = nanobind::none()); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( ccMod, "StructType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "get", - [](py::object cls, py::list aggregateTypes, 
MlirContext context) { + [](nanobind::object cls, nanobind::list aggregateTypes, + MlirContext context) { SmallVector inTys; - for (auto &t : aggregateTypes) - inTys.push_back(unwrap(t.cast())); + for (auto t : aggregateTypes) + inTys.push_back(unwrap(nanobind::cast(t))); return wrap(cudaq::cc::StructType::get(unwrap(context), inTys)); }, - py::arg("cls"), py::arg("aggregateTypes"), - py::arg("context") = py::none()) + nanobind::arg("cls"), nanobind::arg("aggregateTypes"), + nanobind::arg("context") = nanobind::none()) .def_classmethod( "getNamed", - [](py::object cls, const std::string &name, py::list aggregateTypes, - MlirContext context) { + [](nanobind::object cls, const std::string &name, + nanobind::list aggregateTypes, MlirContext context) { SmallVector inTys; - for (auto &t : aggregateTypes) - inTys.push_back(unwrap(t.cast())); + for (auto t : aggregateTypes) + inTys.push_back(unwrap(nanobind::cast(t))); return wrap( cudaq::cc::StructType::get(unwrap(context), name, inTys)); }, - py::arg("cls"), py::arg("name"), py::arg("aggregateTypes"), - py::arg("context") = py::none()) + nanobind::arg("cls"), nanobind::arg("name"), + nanobind::arg("aggregateTypes"), + nanobind::arg("context") = nanobind::none()) .def_classmethod( "getTypes", - [](py::object cls, MlirType structTy) { + [](nanobind::object cls, MlirType structTy) { auto ty = dyn_cast(unwrap(structTy)); if (!ty) throw std::runtime_error( @@ -297,7 +302,7 @@ void registerCCDialectAndTypes(py::module &m) { ret.push_back(wrap(t)); return ret; }) - .def_classmethod("getName", [](py::object cls, MlirType structTy) { + .def_classmethod("getName", [](nanobind::object cls, MlirType structTy) { auto ty = dyn_cast(unwrap(structTy)); if (!ty) throw std::runtime_error( @@ -306,38 +311,40 @@ void registerCCDialectAndTypes(py::module &m) { return ty.getName().getValue().str(); }); - mlir_type_subclass( + mlir::python::nanobind_adaptors::mlir_type_subclass( ccMod, "CallableType", [](MlirType type) { return 
unwrap(type).isa(); }) .def_classmethod("get", - [](py::object cls, MlirContext context, py::list inTypes, - py::list resTypes) { - // Pybind builder: make the builder for this type look - // like that of a FunctionType. + [](nanobind::object cls, MlirContext context, + nanobind::list inTypes, nanobind::list resTypes) { + // Nanobind builder: make the builder for this type + // look like that of a FunctionType. SmallVector inTys; - for (auto &t : inTypes) - inTys.push_back(unwrap(t.cast())); + for (auto t : inTypes) + inTys.push_back(unwrap(nanobind::cast(t))); SmallVector resTys; - for (auto &t : resTypes) - resTys.push_back(unwrap(t.cast())); + for (auto t : resTypes) + resTys.push_back( + unwrap(nanobind::cast(t))); auto *ctx = unwrap(context); return wrap(cudaq::cc::CallableType::get( ctx, FunctionType::get(ctx, inTys, resTys))); }) - .def_classmethod("getFunctionType", [](py::object cls, MlirType type) { - auto callTy = dyn_cast(unwrap(type)); - if (!callTy) - throw std::runtime_error("must be a cc.callable type!"); - return wrap(callTy.getSignature()); - }); - - mlir_type_subclass( + .def_classmethod( + "getFunctionType", [](nanobind::object cls, MlirType type) { + auto callTy = dyn_cast(unwrap(type)); + if (!callTy) + throw std::runtime_error("must be a cc.callable type!"); + return wrap(callTy.getSignature()); + }); + + mlir::python::nanobind_adaptors::mlir_type_subclass( ccMod, "StdvecType", [](MlirType type) { return unwrap(type).isa(); }) .def_classmethod( "getElementType", - [](py::object cls, MlirType type) { + [](nanobind::object cls, MlirType type) { auto ty = unwrap(type); auto casted = dyn_cast(ty); if (!casted) @@ -348,15 +355,15 @@ void registerCCDialectAndTypes(py::module &m) { }) .def_classmethod( "get", - [](py::object cls, MlirType elementType, MlirContext context) { + [](nanobind::object cls, MlirType elementType, MlirContext context) { return wrap(cudaq::cc::StdvecType::get(unwrap(context), unwrap(elementType))); }, - py::arg("cls"), 
py::arg("elementType"), - py::arg("context") = py::none()); + nanobind::arg("cls"), nanobind::arg("elementType"), + nanobind::arg("context") = nanobind::none()); } -void bindRegisterDialects(py::module &mod) { +void bindRegisterDialects(nanobind::module_ &mod) { registerQuakeDialectAndTypes(mod); registerCCDialectAndTypes(mod); diff --git a/python/runtime/mlir/py_register_dialects.h b/python/runtime/mlir/py_register_dialects.h index a81771b4129..4ed5f455f41 100644 --- a/python/runtime/mlir/py_register_dialects.h +++ b/python/runtime/mlir/py_register_dialects.h @@ -8,10 +8,8 @@ #pragma once -#include - -namespace py = pybind11; +#include namespace cudaq { -void bindRegisterDialects(py::module &mod); +void bindRegisterDialects(nanobind::module_ &mod); } // namespace cudaq diff --git a/python/tests/interop/CMakeLists.txt b/python/tests/interop/CMakeLists.txt index 989549f7bd7..8e921aa6001 100644 --- a/python/tests/interop/CMakeLists.txt +++ b/python/tests/interop/CMakeLists.txt @@ -17,7 +17,8 @@ set(CMAKE_INSTALL_RPATH add_subdirectory(quantum_lib) -pybind11_add_module(cudaq_test_cpp_algo +nanobind_add_module(cudaq_test_cpp_algo + NB_STATIC test_cpp_quantum_algorithm_module.cpp ) @@ -28,6 +29,10 @@ target_link_libraries(cudaq_test_cpp_algo cudaq-python-interop ) +if(APPLE) + target_link_options(cudaq_test_cpp_algo PRIVATE -Wl,-undefined,dynamic_lookup) +endif() + target_include_directories(cudaq_test_cpp_algo PRIVATE ${CMAKE_SOURCE_DIR}/python diff --git a/python/tests/interop/quantum_lib/CMakeLists.txt b/python/tests/interop/quantum_lib/CMakeLists.txt index 6e64f778694..4ec95bc4ba4 100644 --- a/python/tests/interop/quantum_lib/CMakeLists.txt +++ b/python/tests/interop/quantum_lib/CMakeLists.txt @@ -20,7 +20,6 @@ add_library(quantum_lib target_include_directories(quantum_lib PRIVATE ${PYTHON_INCLUDE_DIRS} - ${pybind11_INCLUDE_DIRS} ) # Dependencies: quantum_lib uses nvq++ as its compiler, so we need the full diff --git 
a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp index c93e3d59bf0..f150ee3a4ac 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.cpp +++ b/python/tests/interop/quantum_lib/quantum_lib.cpp @@ -8,8 +8,6 @@ #include "quantum_lib.h" -namespace py = pybind11; - __qpu__ void cudaq::entryPoint(const std::function &)> &statePrep) { cudaq::qvector q(2); diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h index 0fe62c24e9b..81f95c06d1c 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.h +++ b/python/tests/interop/quantum_lib/quantum_lib.h @@ -9,7 +9,6 @@ #pragma once #include "cudaq/qis/qubit_qis.h" -#include namespace cudaq { diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp index 50e918e7c8f..f098e35d824 100644 --- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp +++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp @@ -10,12 +10,10 @@ #include "cudaq/algorithms/sample.h" #include "quantum_lib/quantum_lib.h" #include "runtime/interop/PythonCppInterop.h" -#include -#include +#include +#include -namespace py = pybind11; - -PYBIND11_MODULE(cudaq_test_cpp_algo, m) { +NB_MODULE(cudaq_test_cpp_algo, m) { // Example of how to expose C++ kernels. cudaq::python::addDeviceKernelInterop>( m, "qstd", "qft", "(Fake) Quantum Fourier Transform."); @@ -27,14 +25,14 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { // Callback tests m.def( "run0", - [](py::object qern, std::size_t qnum) { + [](nanobind::object qern, std::size_t qnum) { cudaq::python::launch_specialized_py_decorator>( qern, cudaq::sit_and_spin_test, qnum); }, ""); m.def( "run0b", - [](py::object qern, std::size_t qnum) { + [](nanobind::object qern, std::size_t qnum) { // This idiom uses argument marshaling instead of specialization. This // allows `entryPoint` to be called with different arguments. 
Note that // the `decorator` must remain alive for `entryPoint` to be valid. @@ -47,14 +45,14 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { ""); m.def( "run1", - [](py::object qern) { + [](nanobind::object qern) { cudaq::python::launch_specialized_py_decorator>( qern, cudaq::plug_and_chug_test); }, ""); m.def( "run2", - [](py::object qern) { + [](nanobind::object qern) { cudaq::python::launch_specialized_py_decorator< cudaq::qkernel &)>>(qern, cudaq::brain_bend_test); @@ -62,7 +60,7 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { ""); m.def( "run3", - [](py::object qern) { + [](nanobind::object qern) { cudaq::python::launch_specialized_py_decorator< cudaq::qkernel &, std::size_t)>>( qern, cudaq::most_curious_test); @@ -70,7 +68,7 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { ""); m.def( "run4", - [](py::object qern) { + [](nanobind::object qern) { cudaq::python::launch_specialized_py_decorator< cudaq::qkernel &, std::size_t)>>( qern, cudaq::callback_test); @@ -79,7 +77,7 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { m.def( "run5", - [](py::object qern) { + [](nanobind::object qern) { cudaq::python::launch_specialized_py_decorator< cudaq::qkernel()>>(qern, cudaq::py_ret_test1); }, @@ -87,7 +85,7 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { m.def( "run6", - [](py::object qern) { + [](nanobind::object qern) { cudaq::python::launch_specialized_py_decorator< cudaq::qkernel(std::size_t)>>( qern, cudaq::py_ret_test2); diff --git a/python/utils/NanobindAdaptors.h b/python/utils/NanobindAdaptors.h new file mode 100644 index 00000000000..343dde098b7 --- /dev/null +++ b/python/utils/NanobindAdaptors.h @@ -0,0 +1,472 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +#include "mlir-c/Bindings/Python/Interop.h" +#include "mlir-c/IR.h" + +#include "llvm/ADT/Twine.h" + +// Type casters for MLIR C-API types. +namespace nanobind { +namespace detail { + +/// Helper to convert a presumed MLIR API object to a capsule, accepting either +/// an explicit Capsule or indirectly by querying the MLIR_PYTHON_CAPI_PTR_ATTR +/// attribute. +static nanobind::object mlirApiObjectToCapsule(nanobind::handle apiObject) { + if (PyCapsule_CheckExact(apiObject.ptr())) + return nanobind::borrow(apiObject); + if (!nanobind::hasattr(apiObject, MLIR_PYTHON_CAPI_PTR_ATTR)) { + auto repr = nanobind::repr(apiObject); + throw nanobind::type_error((llvm::Twine("Expected an MLIR object (got ") + + std::string(nanobind::str(repr).c_str()) + ").") + .str() + .c_str()); + } + return apiObject.attr(MLIR_PYTHON_CAPI_PTR_ATTR); +} + +/// Casts object <-> MlirAffineMap. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirAffineMap, const_name("MlirAffineMap")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToAffineMap(capsule.ptr()); + return !mlirAffineMapIsNull(value); + } catch (...) { + return false; + } + } + static handle from_cpp(MlirAffineMap v, rv_policy, cleanup_list *) noexcept { + try { + nanobind::object capsule = + nanobind::steal(mlirPythonAffineMapToCapsule(v)); + return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("AffineMap") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + } catch (...) { + return handle(); + } + } +}; + +/// Casts object <-> MlirAttribute. 
+template <> +struct type_caster { + NB_TYPE_CASTER(MlirAttribute, const_name("MlirAttribute")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToAttribute(capsule.ptr()); + return !mlirAttributeIsNull(value); + } catch (...) { + return false; + } + } + static handle from_cpp(MlirAttribute v, rv_policy, cleanup_list *) noexcept { + try { + nanobind::object capsule = + nanobind::steal(mlirPythonAttributeToCapsule(v)); + return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Attribute") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + } catch (...) { + return handle(); + } + } +}; + +/// Casts object -> MlirContext. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirContext, const_name("MlirContext")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + if (src.is_none()) { + src = nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Context") + .attr("current"); + } + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToContext(capsule.ptr()); + return !mlirContextIsNull(value); + } catch (...) { + return false; + } + } +}; + +/// Casts object <-> MlirDialectRegistry. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirDialectRegistry, const_name("MlirDialectRegistry")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToDialectRegistry(capsule.ptr()); + return !mlirDialectRegistryIsNull(value); + } catch (...) 
{ + return false; + } + } + static handle from_cpp(MlirDialectRegistry v, rv_policy, + cleanup_list *) noexcept { + try { + nanobind::object capsule = nanobind::steal( + mlirPythonDialectRegistryToCapsule(v)); + return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("DialectRegistry") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + } catch (...) { + return handle(); + } + } +}; + +/// Casts object <-> MlirLocation. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirLocation, const_name("MlirLocation")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + if (src.is_none()) { + src = nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Location") + .attr("current"); + } + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToLocation(capsule.ptr()); + return !mlirLocationIsNull(value); + } catch (...) { + return false; + } + } + static handle from_cpp(MlirLocation v, rv_policy, cleanup_list *) noexcept { + try { + nanobind::object capsule = + nanobind::steal(mlirPythonLocationToCapsule(v)); + return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Location") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + } catch (...) { + return handle(); + } + } +}; + +/// Casts object <-> MlirModule. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirModule, const_name("MlirModule")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToModule(capsule.ptr()); + return !mlirModuleIsNull(value); + } catch (...) 
{ + return false; + } + } + static handle from_cpp(MlirModule v, rv_policy, cleanup_list *) noexcept { + try { + nanobind::object capsule = + nanobind::steal(mlirPythonModuleToCapsule(v)); + return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Module") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + } catch (...) { + return handle(); + } + } +}; + +/// Casts object <-> MlirOperation. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirOperation, const_name("MlirOperation")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToOperation(capsule.ptr()); + return !mlirOperationIsNull(value); + } catch (...) { + return false; + } + } + static handle from_cpp(MlirOperation v, rv_policy, cleanup_list *) noexcept { + if (v.ptr == nullptr) + return nanobind::none().release(); + try { + nanobind::object capsule = + nanobind::steal(mlirPythonOperationToCapsule(v)); + return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Operation") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + } catch (...) { + return handle(); + } + } +}; + +/// Casts object <-> MlirValue. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirValue, const_name("MlirValue")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToValue(capsule.ptr()); + return !mlirValueIsNull(value); + } catch (...) 
{ + return false; + } + } + static handle from_cpp(MlirValue v, rv_policy, cleanup_list *) noexcept { + if (v.ptr == nullptr) + return nanobind::none().release(); + try { + nanobind::object capsule = + nanobind::steal(mlirPythonValueToCapsule(v)); + return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Value") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + } catch (...) { + return handle(); + } + } +}; + +/// Casts object -> MlirPassManager. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirPassManager, const_name("MlirPassManager")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToPassManager(capsule.ptr()); + return !mlirPassManagerIsNull(value); + } catch (...) { + return false; + } + } +}; + +/// Casts object <-> MlirType. +template <> +struct type_caster { + NB_TYPE_CASTER(MlirType, const_name("MlirType")) + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + try { + nanobind::object capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToType(capsule.ptr()); + return !mlirTypeIsNull(value); + } catch (...) { + return false; + } + } + static handle from_cpp(MlirType t, rv_policy, cleanup_list *) noexcept { + try { + nanobind::object capsule = + nanobind::steal(mlirPythonTypeToCapsule(t)); + return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Type") + .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) + .release(); + } catch (...) { + return handle(); + } + } +}; + +} // namespace detail +} // namespace nanobind + +namespace mlir { +namespace python { +namespace nanobind_adaptors { + +/// Provides a facility like nanobind::class_ for defining a new class in a +/// scope, but this allows extension of an arbitrary Python class, defining +/// methods on it in a similar way. 
These are not "real" nanobind classes but +/// pure Python classes with no relation to a concrete C++ class. +class pure_subclass { +public: + pure_subclass(nanobind::handle scope, const char *derivedClassName, + const nanobind::object &superClass) { + nanobind::object pyType = + nanobind::borrow((PyObject *)&PyType_Type); + nanobind::object metaclass = pyType(superClass); + nanobind::dict attributes; + + thisClass = metaclass(derivedClassName, nanobind::make_tuple(superClass), + attributes); + scope.attr(derivedClassName) = thisClass; + } + + template + pure_subclass &def(const char *name, Func &&f, const Extra &...extra) { + nanobind::object cf( + nanobind::cpp_function(std::forward(f), nanobind::name(name), + nanobind::arg("self"), extra...)); + thisClass.attr(name) = cf; + return *this; + } + + template + pure_subclass &def_property_readonly(const char *name, Func &&f, + const Extra &...extra) { + nanobind::object cf( + nanobind::cpp_function(std::forward(f), nanobind::name(name), + nanobind::arg("self"), extra...)); + auto builtinProperty = + nanobind::borrow((PyObject *)&PyProperty_Type); + thisClass.attr(name) = builtinProperty(cf); + return *this; + } + + template + pure_subclass &def_staticmethod(const char *name, Func &&f, + const Extra &...extra) { + nanobind::object cf(nanobind::cpp_function(std::forward(f), + nanobind::name(name), extra...)); + thisClass.attr(name) = nanobind::steal(PyStaticMethod_New(cf.ptr())); + return *this; + } + + template + pure_subclass &def_classmethod(const char *name, Func &&f, + const Extra &...extra) { + nanobind::object cf( + nanobind::cpp_function(std::forward(f), nanobind::name(name), + nanobind::scope(thisClass), extra...)); + thisClass.attr(name) = + nanobind::steal(PyClassMethod_New(cf.ptr())); + return *this; + } + + nanobind::object get_class() const { return thisClass; } + +protected: + nanobind::object superClass; + nanobind::object thisClass; +}; + +/// Creates a custom subclass of mlir.ir.Type, implementing a 
casting +/// constructor and type checking methods. +class mlir_type_subclass : public pure_subclass { +public: + using IsAFunctionTy = bool (*)(MlirType); + + /// Subclasses by looking up the super-class dynamically. + mlir_type_subclass(nanobind::handle scope, const char *typeClassName, + IsAFunctionTy isaFunction) + : mlir_type_subclass( + scope, typeClassName, isaFunction, + nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Type")) {} + + /// Subclasses with a provided mlir.ir.Type super-class. + mlir_type_subclass(nanobind::handle scope, const char *typeClassName, + IsAFunctionTy isaFunction, + const nanobind::object &superCls) + : pure_subclass(scope, typeClassName, superCls) { + std::string captureTypeName(typeClassName); + nanobind::object newCf(nanobind::cpp_function( + [superCls, isaFunction, captureTypeName](nanobind::object cls, + nanobind::object otherType) { + MlirType rawType = nanobind::cast(otherType); + if (!isaFunction(rawType)) { + auto origRepr = + std::string(nanobind::str(nanobind::repr(otherType)).c_str()); + throw std::invalid_argument((llvm::Twine("Cannot cast type to ") + + captureTypeName + " (from " + + origRepr + ")") + .str()); + } + nanobind::object self = superCls.attr("__new__")(cls, otherType); + return self; + }, + nanobind::name("__new__"), nanobind::arg("cls"), + nanobind::arg("cast_from_type"))); + thisClass.attr("__new__") = newCf; + + // 'isinstance' method. + def_staticmethod( + "isinstance", + [isaFunction](MlirType other) { return isaFunction(other); }, + nanobind::arg("other_type")); + } +}; + +/// Creates a custom subclass of mlir.ir.Attribute, implementing a casting +/// constructor and type checking methods. +class mlir_attribute_subclass : public pure_subclass { +public: + using IsAFunctionTy = bool (*)(MlirAttribute); + + /// Subclasses by looking up the super-class dynamically. 
+ mlir_attribute_subclass(nanobind::handle scope, const char *attrClassName, + IsAFunctionTy isaFunction) + : mlir_attribute_subclass( + scope, attrClassName, isaFunction, + nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr("Attribute")) {} + + /// Subclasses with a provided mlir.ir.Attribute super-class. + mlir_attribute_subclass(nanobind::handle scope, const char *typeClassName, + IsAFunctionTy isaFunction, + const nanobind::object &superCls) + : pure_subclass(scope, typeClassName, superCls) { + std::string captureTypeName(typeClassName); + nanobind::object newCf(nanobind::cpp_function( + [superCls, isaFunction, captureTypeName]( + nanobind::object cls, nanobind::object otherAttribute) { + MlirAttribute rawAttribute = + nanobind::cast(otherAttribute); + if (!isaFunction(rawAttribute)) { + auto origRepr = std::string( + nanobind::str(nanobind::repr(otherAttribute)).c_str()); + throw std::invalid_argument( + (llvm::Twine("Cannot cast attribute to ") + captureTypeName + + " (from " + origRepr + ")") + .str()); + } + nanobind::object self = superCls.attr("__new__")(cls, otherAttribute); + return self; + }, + nanobind::name("__new__"), nanobind::arg("cls"), + nanobind::arg("cast_from_attr"))); + thisClass.attr("__new__") = newCf; + + // 'isinstance' method. + def_staticmethod( + "isinstance", + [isaFunction](MlirAttribute other) { return isaFunction(other); }, + nanobind::arg("other_attribute")); + } +}; + +} // namespace nanobind_adaptors +} // namespace python +} // namespace mlir diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h index 6f7849a7dfe..3180b1a52e0 100644 --- a/python/utils/OpaqueArguments.h +++ b/python/utils/OpaqueArguments.h @@ -27,10 +27,9 @@ #include #include #include -#include -#include - -namespace py = pybind11; +#include +#include +#include namespace cudaq { @@ -42,7 +41,7 @@ class OpaqueArguments; /// argument types. 
Future work should make this function perform more checks, /// we probably want to take the kernel MLIR argument types as input and use /// that to validate that the passed arguments are good to go. -py::args simplifiedValidateInputArguments(py::args &args); +nanobind::args simplifiedValidateInputArguments(nanobind::args &args); /// @brief Search the given Module for the function with provided name. template @@ -76,24 +75,26 @@ mlir::func::FuncOp getKernelFuncOp(MlirModule module, } template -void checkArgumentType(py::handle arg, int index, const std::string &word) { +void checkArgumentType(nanobind::handle arg, int index, + const std::string &word) { if (!py_ext::isConvertible(arg)) { throw std::runtime_error( "kernel argument" + word + " type is '" + std::string(py_ext::typeName()) + "'" + " but argument provided is not (argument " + std::to_string(index) + - ", value=" + py::str(arg).cast() + - ", type=" + py::str(py::type::of(arg)).cast() + ")."); + ", value=" + nanobind::cast(nanobind::str(arg)) + + ", type=" + nanobind::cast(nanobind::str(arg.type())) + + ")."); } } template -void checkArgumentType(py::handle arg, int index) { +void checkArgumentType(nanobind::handle arg, int index) { checkArgumentType(arg, index, ""); } template -void checkListElementType(py::handle arg, int index) { +void checkListElementType(nanobind::handle arg, int index) { checkArgumentType(arg, index, "'s element"); } @@ -114,33 +115,34 @@ std::string mlirTypeToString(mlir::Type ty); /// For the current struct member variable type, insert the value into the /// dynamically constructed struct. void handleStructMemberVariable(void *data, std::size_t offset, - mlir::Type memberType, py::object value); + mlir::Type memberType, nanobind::object value); /// For the current vector element type, insert the value into the dynamically /// constructed vector. 
-void *handleVectorElements(mlir::Type eleTy, py::list list); +void *handleVectorElements(mlir::Type eleTy, nanobind::list list); /// Take a list of python objects (the arguments) and convert them to C++ /// objects on the heap. The results are returned in \p argData and include /// special `deletors` so that the argument data is cleaned up correctly. -void packArgs(OpaqueArguments &argData, py::list args, +void packArgs(OpaqueArguments &argData, nanobind::list args, mlir::ArrayRef mlirTys, - const std::function &backupHandler, mlir::func::FuncOp kernelFuncOp); /// This overload handles dropping the front \p startingArgIdx arguments on the /// floor. They are not packed in \p argData and are simply ignored. -void packArgs(OpaqueArguments &argData, py::args args, +void packArgs(OpaqueArguments &argData, nanobind::args args, mlir::func::FuncOp kernelFuncOp, - const std::function &backupHandler, std::size_t startingArgIdx = 0); /// Return `true` if the given \p args represents a request for broadcasting /// sample or observe over all argument sets. \p args types can be `int`, /// `float`, `list`, so must check if `args[i]` is a `list` or `ndarray`. -inline bool isBroadcastRequest(kernel_builder<> &builder, py::args &args) { +inline bool isBroadcastRequest(kernel_builder<> &builder, + nanobind::args &args) { // FIXME: The use of isArgStdVec in this function inhibits moving this code // out of the header file. 
if (args.empty()) @@ -148,14 +150,14 @@ inline bool isBroadcastRequest(kernel_builder<> &builder, py::args &args) { auto arg = args[0]; // Just need to check the leading argument - if (py::isinstance(arg) && !builder.isArgStdVec(0)) + if (nanobind::isinstance(arg) && !builder.isArgStdVec(0)) return true; - if (py::hasattr(arg, "tolist")) { - if (!py::hasattr(arg, "shape")) + if (nanobind::hasattr(arg, "tolist")) { + if (!nanobind::hasattr(arg, "shape")) return false; - auto shape = arg.attr("shape").cast(); + auto shape = nanobind::cast(arg.attr("shape")); if (shape.size() == 1 && !builder.isArgStdVec(0)) return true; diff --git a/python/utils/PyTypes.h b/python/utils/PyTypes.h index 1872e5228be..6bba9f02fee 100644 --- a/python/utils/PyTypes.h +++ b/python/utils/PyTypes.h @@ -9,22 +9,29 @@ #pragma once #include -#include -#include +#include +#include namespace py_ext { /// Extended python complex object. /// /// Includes `complex`, `numpy.complex64`, `numpy.complex128`. -class Complex : public pybind11::object { +class Complex : public nanobind::object { public: - PYBIND11_OBJECT_CVT(Complex, object, isComplex_, convert_) + NB_OBJECT_DEFAULT(Complex, object, "complex", isComplex_) + + // NOLINTNEXTLINE(google-explicit-constructor) + Complex(const nanobind::object &o) + : object(nanobind::steal(convert_(o.ptr()))) { + if (!m_ptr) + throw nanobind::python_error(); + } Complex(double real, double imag) - : object(PyComplex_FromDoubles(real, imag), stolen_t{}) { + : object(nanobind::steal(PyComplex_FromDoubles(real, imag))) { if (!m_ptr) { - pybind11::pybind11_fail("Could not allocate complex object!"); + throw std::runtime_error("Could not allocate complex object!"); } } @@ -67,7 +74,7 @@ class Complex : public pybind11::object { double imag = PyComplex_ImagAsDouble(o); ret = PyComplex_FromDoubles(real, imag); } else { - pybind11::set_error(PyExc_TypeError, "Unexpected type"); + PyErr_SetString(PyExc_TypeError, "Unexpected type"); } return ret; } @@ -76,22 +83,31 @@ 
class Complex : public pybind11::object { /// Extended python float object. /// /// Includes `float`, `numpy.float64`, `numpy.float32`. -class Float : public pybind11::object { +class Float : public nanobind::object { public: - PYBIND11_OBJECT_CVT(Float, object, isFloat_, convert_) + NB_OBJECT_DEFAULT(Float, object, "float", isFloat_) + + // Converting constructor + // NOLINTNEXTLINE(google-explicit-constructor) + Float(const nanobind::object &o) + : object(nanobind::steal(convert_(o.ptr()))) { + if (!m_ptr) + throw nanobind::python_error(); + } // Allow implicit conversion from float/double: // NOLINTNEXTLINE(google-explicit-constructor) - Float(float value) : object(PyFloat_FromDouble((double)value), stolen_t{}) { + Float(float value) + : object(nanobind::steal(PyFloat_FromDouble((double)value))) { if (!m_ptr) { - pybind11::pybind11_fail("Could not allocate float object!"); + throw std::runtime_error("Could not allocate float object!"); } } // NOLINTNEXTLINE(google-explicit-constructor) Float(double value = .0) - : object(PyFloat_FromDouble((double)value), stolen_t{}) { + : object(nanobind::steal(PyFloat_FromDouble((double)value))) { if (!m_ptr) { - pybind11::pybind11_fail("Could not allocate float object!"); + throw std::runtime_error("Could not allocate float object!"); } } // NOLINTNEXTLINE(google-explicit-constructor) @@ -116,7 +132,7 @@ class Float : public pybind11::object { if (isFloat_(o)) { ret = PyFloat_FromDouble(PyFloat_AsDouble(o)); } else { - pybind11::set_error(PyExc_TypeError, "Unexpected type"); + PyErr_SetString(PyExc_TypeError, "Unexpected type"); } return ret; } @@ -125,15 +141,22 @@ class Float : public pybind11::object { /// Extended python int object. /// /// Includes `int`, `numpy.intXXX`. 
-class Int : public pybind11::object { +class Int : public nanobind::object { public: - PYBIND11_OBJECT_CVT(Int, object, isInt_, convert_) + NB_OBJECT_DEFAULT(Int, object, "int", isInt_) + + // Converting constructor + // NOLINTNEXTLINE(google-explicit-constructor) + Int(const nanobind::object &o) : object(nanobind::steal(convert_(o.ptr()))) { + if (!m_ptr) + throw nanobind::python_error(); + } // Allow implicit conversion from int: // NOLINTNEXTLINE(google-explicit-constructor) - Int(long value) : object(PyLong_FromLong((long)value), stolen_t{}) { + Int(long value) : object(nanobind::steal(PyLong_FromLong((long)value))) { if (!m_ptr) { - pybind11::pybind11_fail("Could not allocate float object!"); + throw std::runtime_error("Could not allocate int object!"); } } @@ -164,7 +187,7 @@ class Int : public pybind11::object { if (isInt_(o)) { ret = PyLong_FromLong(PyLong_AsLong(o)); } else { - pybind11::set_error(PyExc_TypeError, "Unexpected type"); + PyErr_SetString(PyExc_TypeError, "Unexpected type"); } return ret; } @@ -187,79 +210,79 @@ inline char const *typeName() { return "long"; } template <> -inline char const *typeName() { +inline char const *typeName() { return "long"; } template <> -inline char const *typeName() { +inline char const *typeName() { return "bool"; } template <> -inline char const *typeName() { +inline char const *typeName() { return "list"; } -template ::value, int> = 0> -inline bool isConvertible(pybind11::handle o) { - return pybind11::isinstance(o); +template ::value, int> = 0> +inline bool isConvertible(nanobind::handle o) { + return nanobind::isinstance(o); } template <> -inline bool isConvertible(pybind11::handle o) { - return pybind11::isinstance(o) || pybind11::isinstance(o) || - pybind11::isinstance(o); +inline bool isConvertible(nanobind::handle o) { + return nanobind::isinstance(o) || nanobind::isinstance(o) || + nanobind::isinstance(o); } template <> -inline bool isConvertible(pybind11::handle o) { - return pybind11::isinstance(o) || 
- pybind11::isinstance(o); +inline bool isConvertible(nanobind::handle o) { + return nanobind::isinstance(o) || + nanobind::isinstance(o); } template -inline pybind11::object convert(T value) = delete; +inline nanobind::object convert(T value) = delete; template <> -inline pybind11::object convert(bool value) { - return pybind11::bool_(value); +inline nanobind::object convert(bool value) { + return nanobind::bool_(value); } template <> -inline pybind11::object convert(std::int8_t value) { - return pybind11::int_(value); +inline nanobind::object convert(std::int8_t value) { + return nanobind::int_(value); } template <> -inline pybind11::object convert(std::int16_t value) { - return pybind11::int_(value); +inline nanobind::object convert(std::int16_t value) { + return nanobind::int_(value); } template <> -inline pybind11::object convert(std::int32_t value) { - return pybind11::int_(value); +inline nanobind::object convert(std::int32_t value) { + return nanobind::int_(value); } template <> -inline pybind11::object convert(std::int64_t value) { - return pybind11::int_(value); +inline nanobind::object convert(std::int64_t value) { + return nanobind::int_(value); } template <> -inline pybind11::object convert(float value) { +inline nanobind::object convert(float value) { return Float(value); } template <> -inline pybind11::object convert(double value) { +inline nanobind::object convert(double value) { return Float(value); } template <> -inline pybind11::object convert(std::complex value) { +inline nanobind::object convert(std::complex value) { return Complex(value); } template <> -inline pybind11::object convert(std::complex value) { +inline nanobind::object convert(std::complex value) { return Complex(value); } diff --git a/runtime/cudaq/algorithms/optimizers/nlopt/nlopt-src/src/algs/stogo/global.h b/runtime/cudaq/algorithms/optimizers/nlopt/nlopt-src/src/algs/stogo/global.h index e2cb36ee650..aa44bc77aa3 100644 --- 
a/runtime/cudaq/algorithms/optimizers/nlopt/nlopt-src/src/algs/stogo/global.h +++ b/runtime/cudaq/algorithms/optimizers/nlopt/nlopt-src/src/algs/stogo/global.h @@ -51,7 +51,7 @@ class Global : public GlobalParams { Global(RTBox, Pobj, Pgrad, GlobalParams); - virtual ~Global(){}; + virtual ~Global() {}; // Global& operator=(const Global &); diff --git a/runtime/cudaq/operators/matrix.cpp b/runtime/cudaq/operators/matrix.cpp index 7f38431aac3..cb7e6e1cd11 100644 --- a/runtime/cudaq/operators/matrix.cpp +++ b/runtime/cudaq/operators/matrix.cpp @@ -168,7 +168,8 @@ cudaq::complex_matrix::operator*=(const cudaq::complex_matrix &right) { if (cols() != right.rows()) throw std::runtime_error("matrix dimensions mismatch in operator*="); - auto new_data = new cudaq::complex_matrix::value_type[rows() * right.cols()]; + auto new_data = + new cudaq::complex_matrix::value_type[rows() * right.cols()](); cudaq::complex_matrix::Dimensions new_dims = {rows(), right.cols()}; for (std::size_t i = 0; i < rows(); i++) for (std::size_t j = 0; j < right.cols(); j++) @@ -364,7 +365,7 @@ cudaq::complex_matrix cudaq::complex_matrix::exponential() { std::size_t columns = this->cols(); if (rows != columns) throw std::runtime_error("Matrix exponential expects a square matrix."); - auto result = cudaq::complex_matrix(rows, columns, false); + auto result = cudaq::complex_matrix(rows, columns); // Taylor Series Approximation, fixed at 20 steps. std::size_t taylor_steps = 20; for (std::size_t step = 0; step < taylor_steps; step++) { diff --git a/runtime/cudaq/ptsbe/PTSBEExecutionData.h b/runtime/cudaq/ptsbe/PTSBEExecutionData.h index 5b0582b0e44..7aadf871389 100644 --- a/runtime/cudaq/ptsbe/PTSBEExecutionData.h +++ b/runtime/cudaq/ptsbe/PTSBEExecutionData.h @@ -40,7 +40,7 @@ enum class TraceInstructionType { /// @brief Single operation in the PTSBE execution trace. /// /// Stores gate, noise channel, or measurement info with plain qubit indices. 
-/// This is the user-facing trace type exposed to Python via pybind11. +/// This is the user-facing trace type exposed to Python via nanobind. /// struct TraceInstruction { /// @brief Instruction category (Gate, Noise, or Measurement) diff --git a/runtime/cudaq/qis/pauli_word.h b/runtime/cudaq/qis/pauli_word.h index c7eaf611ba1..6ef277eb339 100644 --- a/runtime/cudaq/qis/pauli_word.h +++ b/runtime/cudaq/qis/pauli_word.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace cudaq { diff --git a/tpls/nanobind b/tpls/nanobind new file mode 160000 index 00000000000..2a61ad2494d --- /dev/null +++ b/tpls/nanobind @@ -0,0 +1 @@ +Subproject commit 2a61ad2494d09fecb2e13322c1383342c299900d From 1a24c665e7efe39285ad8f3b9a3516551eb15fd2 Mon Sep 17 00:00:00 2001 From: Harshit Date: Thu, 16 Apr 2026 14:30:08 +0000 Subject: [PATCH 12/85] fix: docs for qbraid helper and update examples --- .github/pre-commit/spelling_allowlist.txt | 4 +- docs/sphinx/targets/cpp/qbraid.cpp | 8 +- docs/sphinx/targets/python/qbraid.py | 8 +- docs/sphinx/using/backends/cloud.rst | 2 +- docs/sphinx/using/backends/cloud/qbraid.rst | 102 ++++++++++++------ .../using/backends/hardware/iontrap.rst | 63 ----------- .../{test_Qbraid.py => test_qbraid.py} | 3 +- .../helpers/qbraid/QbraidServerHelper.cpp | 8 +- 8 files changed, 87 insertions(+), 111 deletions(-) rename python/tests/backends/{test_Qbraid.py => test_qbraid.py} (99%) diff --git a/.github/pre-commit/spelling_allowlist.txt b/.github/pre-commit/spelling_allowlist.txt index 984c5e929c8..12d96e4e884 100644 --- a/.github/pre-commit/spelling_allowlist.txt +++ b/.github/pre-commit/spelling_allowlist.txt @@ -1,7 +1,6 @@ ABI AFQMC API -api APIs AST Aer @@ -125,7 +124,6 @@ QRMI QTX QX QaaS -Qbraid Qiskit QuEra QuTiP @@ -172,6 +170,7 @@ amongst ancilla ansatz ansatzes +api archiver arity auxillary @@ -338,6 +337,7 @@ preprocessor probability programmatically pybind +qBraid qaoa qbraid qed diff --git a/docs/sphinx/targets/cpp/qbraid.cpp 
b/docs/sphinx/targets/cpp/qbraid.cpp index b6d859ec2de..f7a15a0906e 100644 --- a/docs/sphinx/targets/cpp/qbraid.cpp +++ b/docs/sphinx/targets/cpp/qbraid.cpp @@ -2,12 +2,12 @@ // ``` // nvq++ --target qbraid qbraid.cpp -o out.x && ./out.x // ``` -// This will submit the job to the Qbraid ideal simulator target (default). +// This will submit the job to the qBraid ideal simulator target (default). #include #include -// Define a simple quantum kernel to execute on Qbraid. +// Define a simple quantum kernel to execute on qBraid. struct ghz { // Maximally entangled state between 5 qubits. auto operator()() __qpu__ { @@ -21,7 +21,7 @@ struct ghz { }; int main() { - // Submit to Qbraid asynchronously (e.g., continue executing + // Submit to qBraid asynchronously (e.g., continue executing // code in the file until the job has been returned). auto future = cudaq::sample_async(ghz{}); // ... classical code to execute in the meantime ... @@ -41,7 +41,7 @@ int main() { auto async_counts = readIn.get(); async_counts.dump(); - // OR: Submit to Qbraid synchronously (e.g., wait for the job + // OR: Submit to qBraid synchronously (e.g., wait for the job // result to be returned before proceeding). auto counts = cudaq::sample(ghz{}); counts.dump(); diff --git a/docs/sphinx/targets/python/qbraid.py b/docs/sphinx/targets/python/qbraid.py index cf3fe483c6b..dc61d605709 100644 --- a/docs/sphinx/targets/python/qbraid.py +++ b/docs/sphinx/targets/python/qbraid.py @@ -7,7 +7,7 @@ cudaq.set_target("qbraid") -# Create the kernel we'd like to execute on Qbraid. +# Create the kernel we'd like to execute on qBraid. @cudaq.kernel def kernel(): qvector = cudaq.qvector(2) @@ -15,12 +15,12 @@ def kernel(): x.ctrl(qvector[0], qvector[1]) -# Execute on Qbraid and print out the results. +# Execute on qBraid and print out the results. # Option A: # By using the asynchronous `cudaq.sample_async`, the remaining # classical code will be executed while the job is being handled -# by IonQ. 
This is ideal when submitting via a queue over +# by qBraid. This is ideal when submitting via a queue over # the cloud. async_results = cudaq.sample_async(kernel) # ... more classical code to run ... @@ -46,6 +46,6 @@ def kernel(): # Option B: # By using the synchronous `cudaq.sample`, the execution of # any remaining classical code in the file will occur only -# after the job has been returned from Qbraid. +# after the job has been returned from qBraid. counts = cudaq.sample(kernel) print(counts) diff --git a/docs/sphinx/using/backends/cloud.rst b/docs/sphinx/using/backends/cloud.rst index d2044d64e9e..ebd02e033e8 100644 --- a/docs/sphinx/using/backends/cloud.rst +++ b/docs/sphinx/using/backends/cloud.rst @@ -8,4 +8,4 @@ CUDA-Q provides a number of options to access hardware resources (GPUs and QPUs) Amazon Braket (braket) Scaleway QaaS (scaleway) - Qbraid + qBraid diff --git a/docs/sphinx/using/backends/cloud/qbraid.rst b/docs/sphinx/using/backends/cloud/qbraid.rst index a7e7fe4a2ae..dfa72e53913 100644 --- a/docs/sphinx/using/backends/cloud/qbraid.rst +++ b/docs/sphinx/using/backends/cloud/qbraid.rst @@ -1,61 +1,101 @@ -QBRAID -+++++++ +qBraid +++++++ .. _qbraid-backend: +`qBraid `__ is a cloud platform that brokers access to +quantum simulators and hardware from multiple vendors through a single API. +CUDA-Q can submit OpenQASM 2 jobs to any device exposed by the qBraid service. +See the `qBraid device catalog `__ for the +set of simulators and QPUs currently available. + Setting Credentials -````````````````````````` +``````````````````` -Programmers of CUDA-Q may access the `Qbraid Devices -`__ from either C++ or Python. Generate -an API key from your `Qbraid account `__ and export -it as an environment variable: +Generate an API key from your `qBraid account `__ +and export it as an environment variable: .. 
code:: bash - export QBRAID_API_KEY="qbraid_generated_api_key" + export QBRAID_API_KEY="qbraid_generated_api_key" + +Alternatively, the API key can be passed directly to ``cudaq.set_target`` via +the ``api_key`` argument (see below). + +Submitting +`````````` + +.. tab:: Python + + The target to which quantum kernels are submitted can be controlled with + the ``cudaq.set_target()`` function. + + .. code:: python + + cudaq.set_target("qbraid") + + By default, jobs are submitted to the qBraid state vector simulator + (``qbraid:qbraid:sim:qir-sv``). + To specify a different qBraid device, set the ``machine`` parameter to its + qBraid device ID. -Submission from Python -````````````````````````` + .. code:: python + + cudaq.set_target("qbraid", machine="qbraid:qbraid:sim:qir-sv") - First, set the :code:`qbraid` backend. + The API key can also be supplied inline instead of through the + ``QBRAID_API_KEY`` environment variable. .. code:: python - cudaq.set_target('qbraid') + cudaq.set_target("qbraid", api_key="qbraid_generated_api_key") - By default, quantum kernel code will be submitted to the IonQ simulator on qBraid. + qBraid devices are cloud-hosted, so local emulation via the ``emulate`` + flag is not supported — all jobs are executed on the qBraid service. + To run without submitting to real hardware, select one of the qBraid + simulator devices (for example, ``qbraid:qbraid:sim:qir-sv``) via the + ``machine`` argument. - To emulate the qbraid's simulator locally, without submitting through the cloud, you can also set the ``emulate`` flag to ``True``. This will emit any target specific compiler diagnostics. + The number of shots for a kernel execution can be set through the + ``shots_count`` argument to ``cudaq.sample`` or ``cudaq.observe``. The + default is 1000. - .. code:: python + .. 
code:: python - cudaq.set_target('qbraid', emulate=True) + cudaq.sample(kernel, shots_count=10000) - The number of shots for a kernel execution can be set through the ``shots_count`` argument to ``cudaq.sample`` or ``cudaq.observe``. By default, the ``shots_count`` is set to 1000. +.. tab:: C++ - .. code:: python + To target quantum kernel code for execution on qBraid, pass the flag + ``--target qbraid`` to the ``nvq++`` compiler. By default jobs are + submitted to the qBraid state vector simulator + (``qbraid:qbraid:sim:qir-sv``). - cudaq.sample(kernel, shots_count=10000) + .. code:: bash - To see a complete example for using Qbraid's backends, take a look at our :doc:`Python examples <../../examples/examples>`. + nvq++ --target qbraid src.cpp -Submission from C++ -````````````````````````` - To target quantum kernel code for execution using qbraid, - pass the flag ``--target qbraid`` to the ``nvq++`` compiler. + To execute kernels on a different device, pass ``--qbraid-machine`` with + the qBraid device ID: - .. code:: bash + .. code:: bash - nvq++ --target qbraid src.cpp + nvq++ --target qbraid --qbraid-machine "qbraid:qbraid:sim:qir-sv" src.cpp - This will take the API key and handle all authentication with, and submission to, the Qbraid device. By default, quantum kernel code will be submitted to the Qbraidsimulator. + The API key can be passed explicitly with ``--qbraid-api_key`` instead of + being read from ``QBRAID_API_KEY``: - To emulate the qbraid's machine locally, without submitting through the cloud, you can also pass the ``--emulate`` flag to ``nvq++``. This will emit any target specific compiler diagnostics, before running a noise free emulation. + .. code:: bash - .. code:: bash + nvq++ --target qbraid --qbraid-api_key "qbraid_generated_api_key" src.cpp - nvq++ --emulate --target qbraid src.cpp + qBraid devices are cloud-hosted, so the ``--emulate`` flag is not + supported for this target — all jobs are executed on the qBraid + service. 
To run without submitting to real hardware, pass + ``--qbraid-machine`` with a qBraid simulator device ID (for example, + ``qbraid:qbraid:sim:qir-sv``). - To see a complete example for using IonQ's backends, take a look at our :doc:`C++ examples <../../examples/examples>`. +To see a complete example for using qBraid's backends, take a look at our +:doc:`Python examples <../../examples/examples>` and +:doc:`C++ examples <../../examples/examples>`. diff --git a/docs/sphinx/using/backends/hardware/iontrap.rst b/docs/sphinx/using/backends/hardware/iontrap.rst index 0dc69de2177..160ab9f549f 100644 --- a/docs/sphinx/using/backends/hardware/iontrap.rst +++ b/docs/sphinx/using/backends/hardware/iontrap.rst @@ -245,66 +245,3 @@ To see a complete example, take a look at :ref:`Quantinuum examples `__ from either C++ or Python. Generate -an API key from your `Qbraid account `__ and export -it as an environment variable: - -.. code:: bash - - export QBRAID_API_KEY="qbraid_generated_api_key" - - -Submitting -````````````````````````` -.. tab:: Python - - First, set the :code:`qbraid` backend. - - .. code:: python - - cudaq.set_target('qbraid') - - By default, quantum kernel code will be submitted to the IonQ simulator on qBraid. - - To emulate the qbraid's simulator locally, without submitting through the cloud, you can also set the ``emulate`` flag to ``True``. This will emit any target specific compiler diagnostics. - - .. code:: python - - cudaq.set_target('qbraid', emulate=True) - - The number of shots for a kernel execution can be set through the ``shots_count`` argument to ``cudaq.sample`` or ``cudaq.observe``. By default, the ``shots_count`` is set to 1000. - - .. code:: python - - cudaq.sample(kernel, shots_count=10000) - - To see a complete example for using Qbraid's backends, take a look at our :doc:`Python examples <../../examples/examples>`. - - -.. 
tab:: C++ - - To target quantum kernel code for execution using qbraid, - pass the flag ``--target qbraid`` to the ``nvq++`` compiler. - - .. code:: bash - - nvq++ --target qbraid src.cpp - - This will take the API key and handle all authentication with, and submission to, the Qbraid device. By default, quantum kernel code will be submitted to the Qbraidsimulator. - - To emulate the qbraid's machine locally, without submitting through the cloud, you can also pass the ``--emulate`` flag to ``nvq++``. This will emit any target specific compiler diagnostics, before running a noise free emulation. - - .. code:: bash - - nvq++ --emulate --target qbraid src.cpp - - To see a complete example for using IonQ's backends, take a look at our :doc:`C++ examples <../../examples/examples>`. diff --git a/python/tests/backends/test_Qbraid.py b/python/tests/backends/test_qbraid.py similarity index 99% rename from python/tests/backends/test_Qbraid.py rename to python/tests/backends/test_qbraid.py index 6b6d1599753..c42cc3e1a02 100644 --- a/python/tests/backends/test_Qbraid.py +++ b/python/tests/backends/test_qbraid.py @@ -23,10 +23,9 @@ port = 62452 # Default machine for tests. Mirrors the real qBraid device string format. -TEST_MACHINE = "ionq:ionq:sim:simulator" +TEST_MACHINE = "qbraid:qbraid:sim:qir-sv" TEST_API_KEY = "00000000000000000000000000000000" - # The qbraid mock server in utils/mock_qpu/qbraid/__init__.py doesn't simulate # quantum mechanics - it only inspects the QASM for `h` and `measure` ops and # generates random outcomes for qubits with H. 
It does NOT model entanglement diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp index b53979fd98e..a9c4e1a8e6d 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp @@ -19,14 +19,14 @@ namespace cudaq { class QbraidServerHelper : public ServerHelper { static constexpr const char *DEFAULT_URL = "https://api-v2.qbraid.com/api/v1"; - static constexpr const char *DEFAULT_DEVICE = "ionq:ionq:sim:simulator"; + static constexpr const char *DEFAULT_DEVICE = "qbraid:qbraid:sim:qir-sv"; static constexpr int DEFAULT_QUBITS = 29; public: const std::string name() const override { return "qbraid"; } void initialize(BackendConfig config) override { - cudaq::info("Initializing Qbraid Backend."); + cudaq::info("Initializing qBraid Backend."); backendConfig.clear(); backendConfig["url"] = getValueOrDefault(config, "url", DEFAULT_URL); @@ -34,7 +34,7 @@ class QbraidServerHelper : public ServerHelper { backendConfig["qubits"] = std::to_string(DEFAULT_QUBITS); // Accept "machine" as a user-friendly alias for device_id - // Usage: cudaq.set_target("qbraid", machine="ionq:ionq:sim:simulator") + // Usage: cudaq.set_target("qbraid", machine="qbraid:qbraid:sim:qir-sv") if (!config["machine"].empty()) { backendConfig["device_id"] = config["machine"]; } else { @@ -61,7 +61,7 @@ class QbraidServerHelper : public ServerHelper { parseConfigForCommonParams(config); - cudaq::info("Qbraid configuration initialized:"); + cudaq::info("qBraid configuration initialized:"); for (const auto &[key, value] : backendConfig) { cudaq::info(" {} = {}", key, value); } From 31bde9eb8c7296451e84ce09fa33b434b86920af Mon Sep 17 00:00:00 2001 From: Harshit Date: Fri, 17 Apr 2026 06:24:52 +0000 Subject: [PATCH 13/85] DCO Remediation Commit for Harshit I, Harshit , hereby add my Signed-off-by to 
this commit: 9cd62cffc6562aef4c425857cb97edf80c5a3407 I, Harshit , hereby add my Signed-off-by to this commit: 3b0a1e4c84bab1378b4b37b279060143794cb21f I, Harshit , hereby add my Signed-off-by to this commit: 1a24c665e7efe39285ad8f3b9a3516551eb15fd2 Signed-off-by: Harshit --- .../platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp index a9c4e1a8e6d..a7f89cbb0b7 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp @@ -33,7 +33,7 @@ class QbraidServerHelper : public ServerHelper { backendConfig["user_agent"] = "cudaq/" + std::string(cudaq::getVersion()); backendConfig["qubits"] = std::to_string(DEFAULT_QUBITS); - // Accept "machine" as a user-friendly alias for device_id + // Accept "machine" as a user-friendly alias for qBraid's device_id // Usage: cudaq.set_target("qbraid", machine="qbraid:qbraid:sim:qir-sv") if (!config["machine"].empty()) { backendConfig["device_id"] = config["machine"]; From 61e4b91db34eebf5ab0b25a9cc6fc8e730d85c38 Mon Sep 17 00:00:00 2001 From: TheGupta2012 Date: Fri, 17 Apr 2026 06:51:14 +0000 Subject: [PATCH 14/85] DCO Remediation Commit for TheGupta2012 I, TheGupta2012 , hereby add my Signed-off-by to this commit: 925ae39eebd02886afd9415a9546b1f74fc65d15 I, TheGupta2012 , hereby add my Signed-off-by to this commit: 41fe2486fc9777cd8941ec1d4f4df5da8a6cf389 I, TheGupta2012 , hereby add my Signed-off-by to this commit: d74243dd59e94091acae92bedd96ae41332bde68 Signed-off-by: TheGupta2012 --- .../platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp 
b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp index a7f89cbb0b7..1b137c03bed 100644 --- a/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp +++ b/runtime/cudaq/platform/default/rest/helpers/qbraid/QbraidServerHelper.cpp @@ -254,7 +254,7 @@ class QbraidServerHelper : public ServerHelper { // // Context: nvq++ emits one `creg varK[1];` per measurement. AWS Braket's // classical simulators (SV1, DM1, TN1) tolerate that via lenient register - // concatenation, but stricter hardware transpilers reject it: + // concatenation, but stricter hardware transpilers below reject it: // - IQM (Garnet etc.): returns only the first register -> 1-bit results // - Rigetti: collapses all registers onto b[0] -> "bit already in use" // - IonQ-via-Braket: similar strict behavior From 66a1e2d0d89de20ebbe68b84e9ba80bd5da96529 Mon Sep 17 00:00:00 2001 From: Luca Mondada <72734770+lmondada@users.noreply.github.com> Date: Fri, 17 Apr 2026 13:07:00 +0200 Subject: [PATCH 15/85] Add stateless CompiledModuleHelper (#4338) This is a rewrite of #4329, using a stateless class with static functions rather than a builder pattern. 
Signed-off-by: Luca Mondada --- python/extension/CMakeLists.txt | 1 + runtime/common/CompiledModule.cpp | 24 +----- runtime/common/CompiledModule.h | 47 ++++------- runtime/cudaq/platform/default/python/QPU.cpp | 18 ++-- runtime/internal/compiler/CMakeLists.txt | 1 + .../compiler/CompiledModuleHelper.cpp | 83 +++++++++++++++++++ runtime/internal/compiler/JIT.cpp | 18 ---- .../compiler/CompiledModuleHelper.h | 74 +++++++++++++++++ .../include/cudaq_internal/compiler/JIT.h | 8 -- 9 files changed, 187 insertions(+), 87 deletions(-) create mode 100644 runtime/internal/compiler/CompiledModuleHelper.cpp create mode 100644 runtime/internal/compiler/include/cudaq_internal/compiler/CompiledModuleHelper.h diff --git a/python/extension/CMakeLists.txt b/python/extension/CMakeLists.txt index dac02d47fca..d5bad6f9ac1 100644 --- a/python/extension/CMakeLists.txt +++ b/python/extension/CMakeLists.txt @@ -129,6 +129,7 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension ../../runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp ../../runtime/cudaq/platform/default/python/QPU.cpp ../../runtime/internal/compiler/ArgumentConversion.cpp + ../../runtime/internal/compiler/CompiledModuleHelper.cpp ../../runtime/internal/compiler/LayoutInfo.cpp ../../runtime/internal/compiler/RuntimeMLIR.cpp ../../runtime/internal/compiler/RuntimePyMLIR.cpp diff --git a/runtime/common/CompiledModule.cpp b/runtime/common/CompiledModule.cpp index e0fabfa2480..cefbb5473fd 100644 --- a/runtime/common/CompiledModule.cpp +++ b/runtime/common/CompiledModule.cpp @@ -7,15 +7,11 @@ ******************************************************************************/ #include "CompiledModule.h" -#include "cudaq/Optimizer/Builder/RuntimeNames.h" #include #include -using namespace cudaq_internal::compiler; - -cudaq::CompiledModule::CompiledModule(std::string kernelName, - ResultInfo resultInfo) - : name(std::move(kernelName)), resultInfo(std::move(resultInfo)) {} 
+cudaq::CompiledModule::CompiledModule(std::string kernelName) + : name(std::move(kernelName)) {} const cudaq::CompiledModule::JitArtifact & cudaq::CompiledModule::getJit() const { @@ -104,19 +100,3 @@ void (*cudaq::CompiledModule::JitArtifact::getEntryPoint() const)() { cudaq::JitEngine cudaq::CompiledModule::JitArtifact::getEngine() const { return engine; } - -void cudaq::CompiledModule::attachJit(JitEngine engine, - bool isFullySpecialized) { - bool hasResult = resultInfo.hasResult(); - std::string fullName = cudaq::runtime::cudaqGenPrefixName + name; - std::string entryName = - (hasResult || !isFullySpecialized) ? name + ".thunk" : fullName; - void (*entryPoint)() = engine.lookupRawNameOrFail(entryName); - int64_t (*argsCreator)(const void *, void **) = nullptr; - if (!isFullySpecialized) - argsCreator = reinterpret_cast( - engine.lookupRawNameOrFail(name + ".argsCreator")); - - addArtifact(name, JitArtifact{std::move(engine), entryPoint, argsCreator, - std::nullopt}); -} diff --git a/runtime/common/CompiledModule.h b/runtime/common/CompiledModule.h index df170960107..9837e81af26 100644 --- a/runtime/common/CompiledModule.h +++ b/runtime/common/CompiledModule.h @@ -20,25 +20,17 @@ #include // This header file and the types defined within are designed to have no -// dependencies and be useable across the compiler and runtime. However, -// constructing instances of these types is easiest done within compilation -// units that do link against MLIR. We provide this functionality via free -// functions, defined as friends of the types defined here and implemented in -// the `cudaq-mlir-runtime` library. +// dependencies and be useable across the compiler and runtime. Constructing +// `CompiledModule` is supported through +// `cudaq_internal::compiler::CompiledModuleHelper`, available in +// `CompiledModuleHelper.h` from `cudaq-mlir-runtime`. 
namespace mlir { -class Type; -class ModuleOp; class ExecutionEngine; } // namespace mlir -namespace cudaq { -class ResultInfo; -} // namespace cudaq - namespace cudaq_internal::compiler { -cudaq::ResultInfo createResultInfo(mlir::Type resultType, bool isEntryPoint, - mlir::ModuleOp module); +class CompiledModuleHelper; } // namespace cudaq_internal::compiler namespace cudaq { @@ -73,12 +65,9 @@ class JitEngine { }; /// Pre-computed result metadata, set at build time. Used at execution time -/// for result buffer allocation and type conversion. Construct via -/// `createResultInfo` (implemented in `cudaq-mlir-runtime`). +/// for result buffer allocation and type conversion. class ResultInfo { - // Friend factory function, to be used for construction. - friend cudaq::ResultInfo cudaq_internal::compiler::createResultInfo( - mlir::Type resultType, bool isEntryPoint, mlir::ModuleOp module); + friend class cudaq_internal::compiler::CompiledModuleHelper; friend class CompiledModule; /// Opaque pointer to the `mlir::Type` of the result. Obtained via @@ -106,8 +95,8 @@ class ResultInfo { /// of a Quake MLIR module. /// /// This type does not depend on MLIR/LLVM — it only keeps type-erased / opaque -/// pointers. Use the `attachJit` member function to attach JIT-compiled -/// artifacts after construction. +/// pointers. Build instances with +/// `cudaq_internal::compiler::CompiledModuleHelper`. class CompiledModule { public: // --- Compiled artifact types --- @@ -126,6 +115,7 @@ class CompiledModule { resourceCounts(std::move(resourceCounts)) {} friend class CompiledModule; + friend class cudaq_internal::compiler::CompiledModuleHelper; public: // TODO: remove the following two methods once the `CompiledModule` instance @@ -167,17 +157,6 @@ class CompiledModule { /// A compiled artifact is either a JIT binary or an MLIR module. 
using CompiledArtifact = std::variant; - // --- Construction --- - - CompiledModule(std::string kernelName, ResultInfo resultInfo); - - /// @brief Populate the JIT representation of a `CompiledModule`. - /// - /// Resolves the entry point and (optionally) `argsCreator` symbols from the - /// engine, using the kernel's name and result metadata to determine the - /// correct mangled symbol names. - void attachJit(JitEngine engine, bool isFullySpecialized); - // --- Queries --- /// Whether any artifact in the map is a JitArtifact. @@ -222,7 +201,11 @@ class CompiledModule { KernelThunkResultType execute(const std::vector &rawArgs) const; private: - /// Add a compiled artifact to the kernel. + friend class cudaq_internal::compiler::CompiledModuleHelper; + + CompiledModule(std::string kernelName); + + /// Add a compiled artifact to the module under the given name. void addArtifact(std::string name, CompiledArtifact artifact); std::string name; diff --git a/runtime/cudaq/platform/default/python/QPU.cpp b/runtime/cudaq/platform/default/python/QPU.cpp index a70ae4ac377..559d0360892 100644 --- a/runtime/cudaq/platform/default/python/QPU.cpp +++ b/runtime/cudaq/platform/default/python/QPU.cpp @@ -23,6 +23,7 @@ #include "cudaq/Verifier/QIRLLVMIRDialect.h" #include "cudaq/platform.h" #include "cudaq_internal/compiler/ArgumentConversion.h" +#include "cudaq_internal/compiler/CompiledModuleHelper.h" #include "cudaq_internal/compiler/JIT.h" #include "cudaq_internal/compiler/RuntimeMLIR.h" #include "mlir/ExecutionEngine/ExecutionEngine.h" @@ -359,12 +360,14 @@ struct PythonLauncher : public cudaq::ModuleLauncher { varArgIndices.clear(); } const bool isFullySpecialized = varArgIndices.empty(); - auto resultInfo = createResultInfo(resultTy, isEntryPoint, module); + auto resultInfo = + CompiledModuleHelper::createResultInfo(resultTy, isEntryPoint, module); if (auto jit = alreadyBuiltJITCode(name, rawArgs)) { - cudaq::CompiledModule ck(name, resultInfo); - ck.attachJit(*jit, 
isFullySpecialized); - return ck; + auto jitArtifacts = CompiledModuleHelper::createJitArtifacts( + name, *jit, resultInfo, isFullySpecialized); + return CompiledModuleHelper::createCompiledModule(name, resultInfo, + jitArtifacts); } // 1. Check that this call is sane. @@ -404,9 +407,10 @@ struct PythonLauncher : public cudaq::ModuleLauncher { cudaq::compiler_artifact::saveArtifact(name, rawArgs, jit, argsCreatorThunk); - cudaq::CompiledModule ck(name, resultInfo); - ck.attachJit(jit, isFullySpecialized); - return ck; + auto jitArtifacts = CompiledModuleHelper::createJitArtifacts( + name, jit, resultInfo, isFullySpecialized); + return CompiledModuleHelper::createCompiledModule( + name, std::move(resultInfo), jitArtifacts); } }; } // namespace diff --git a/runtime/internal/compiler/CMakeLists.txt b/runtime/internal/compiler/CMakeLists.txt index 662f003a542..5af9675271c 100644 --- a/runtime/internal/compiler/CMakeLists.txt +++ b/runtime/internal/compiler/CMakeLists.txt @@ -23,6 +23,7 @@ add_library(cudaq-mlir-runtime SHARED ArgumentConversion.cpp Compiler.cpp + CompiledModuleHelper.cpp JIT.cpp RuntimeMLIR.cpp RuntimeCppMLIR.cpp diff --git a/runtime/internal/compiler/CompiledModuleHelper.cpp b/runtime/internal/compiler/CompiledModuleHelper.cpp new file mode 100644 index 00000000000..ec3a227aa4a --- /dev/null +++ b/runtime/internal/compiler/CompiledModuleHelper.cpp @@ -0,0 +1,83 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "cudaq_internal/compiler/CompiledModuleHelper.h" +#include "cudaq/Optimizer/Builder/RuntimeNames.h" +#include "cudaq_internal/compiler/LayoutInfo.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Types.h" + +using namespace mlir; + +namespace cudaq_internal::compiler { + +cudaq::ResultInfo CompiledModuleHelper::createResultInfo(Type resultTy, + bool isEntryPoint, + ModuleOp module) { + cudaq::ResultInfo info; + if (!resultTy || !isEntryPoint) + return info; + + info.typeOpaquePtr = resultTy.getAsOpaquePointer(); + auto [size, offsets] = getResultBufferLayout(module, resultTy); + info.bufferSize = size; + info.fieldOffsets = std::move(offsets); + return info; +} + +std::vector +CompiledModuleHelper::createJitArtifacts(const std::string &kernelName, + cudaq::JitEngine engine, + const cudaq::ResultInfo &resultInfo, + bool isFullySpecialized) { + bool hasResult = resultInfo.hasResult(); + std::string fullName = + std::string(cudaq::runtime::cudaqGenPrefixName) + kernelName; + std::string entryName = + (hasResult || !isFullySpecialized) ? 
kernelName + ".thunk" : fullName; + void (*entryPoint)() = engine.lookupRawNameOrFail(entryName); + int64_t (*argsCreator)(const void *, void **) = nullptr; + if (!isFullySpecialized) + argsCreator = reinterpret_cast( + engine.lookupRawNameOrFail(kernelName + ".argsCreator")); + + std::vector artifacts; + artifacts.emplace_back(kernelName, cudaq::CompiledModule::JitArtifact{ + std::move(engine), entryPoint, + argsCreator, std::nullopt}); + return artifacts; +} + +cudaq::CompiledModule CompiledModuleHelper::createCompiledModule( + std::string name, cudaq::ResultInfo resultInfo, + std::vector jitArtifacts) { + return createCompiledModule(std::move(name), std::move(resultInfo), + std::move(jitArtifacts), {}); +} + +cudaq::CompiledModule CompiledModuleHelper::createCompiledModule( + std::string name, cudaq::ResultInfo resultInfo, + std::vector mlirArtifacts) { + return createCompiledModule(std::move(name), std::move(resultInfo), {}, + std::move(mlirArtifacts)); +} + +cudaq::CompiledModule CompiledModuleHelper::createCompiledModule( + std::string name, cudaq::ResultInfo resultInfo, + std::vector jitArtifacts, + std::vector mlirArtifacts) { + cudaq::CompiledModule compiled(std::move(name)); + compiled.resultInfo = std::move(resultInfo); + for (auto &[artName, artifact] : jitArtifacts) + compiled.addArtifact(std::move(artName), std::move(artifact)); + for (auto &[artName, artifact] : mlirArtifacts) + compiled.addArtifact(std::move(artName), std::move(artifact)); + return compiled; +} + +} // namespace cudaq_internal::compiler diff --git a/runtime/internal/compiler/JIT.cpp b/runtime/internal/compiler/JIT.cpp index 9696c94403c..90b783cb69f 100644 --- a/runtime/internal/compiler/JIT.cpp +++ b/runtime/internal/compiler/JIT.cpp @@ -19,7 +19,6 @@ #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Verifier/QIRLLVMIRDialect.h" #include "cudaq/runtime/logger/logger.h" -#include "cudaq_internal/compiler/LayoutInfo.h" #include 
"llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" @@ -340,23 +339,6 @@ cudaq_internal::compiler::createJITEngine(ModuleOp &moduleOp, return JitEngine(std::move(jitOrError.get())); } -/// Build a `ResultInfo` from an MLIR return type. -/// \p resultTy may be null (no return value). When \p isEntryPoint is false, -/// the result is not marshaled — returns an empty `ResultInfo`. -cudaq::ResultInfo cudaq_internal::compiler::createResultInfo(Type resultTy, - bool isEntryPoint, - ModuleOp module) { - cudaq::ResultInfo info; - if (!resultTy || !isEntryPoint) - return info; - - info.typeOpaquePtr = resultTy.getAsOpaquePointer(); - auto [size, offsets] = getResultBufferLayout(module, resultTy); - info.bufferSize = size; - info.fieldOffsets = std::move(offsets); - return info; -} - class cudaq::JitEngine::Impl : public cudaq::JitEngine::Base { public: Impl(std::unique_ptr jitEngine) diff --git a/runtime/internal/compiler/include/cudaq_internal/compiler/CompiledModuleHelper.h b/runtime/internal/compiler/include/cudaq_internal/compiler/CompiledModuleHelper.h new file mode 100644 index 00000000000..d979fcb0535 --- /dev/null +++ b/runtime/internal/compiler/include/cudaq_internal/compiler/CompiledModuleHelper.h @@ -0,0 +1,74 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ +#pragma once + +#include "common/CompiledModule.h" + +namespace mlir { +class Type; +class ModuleOp; +} // namespace mlir + +namespace cudaq_internal::compiler { + +/// Compiler-side helper for `cudaq::CompiledModule`: static factory methods and +/// utilities that depend on MLIR but pair with the MLIR-free `CompiledModule` +/// API in `common/CompiledModule.h`. +class CompiledModuleHelper { +public: + // --- Named artifact aliases --- + + using NamedJitArtifact = + std::pair; + using NamedMlirArtifact = + std::pair; + + CompiledModuleHelper() = delete; + + // --- ResultInfo construction --- + + /// Create a `ResultInfo` from MLIR type metadata. + /// + /// When \p resultType is null or \p isEntryPoint is false, returns an empty + /// `ResultInfo` (no marshaled return value). + static cudaq::ResultInfo createResultInfo(mlir::Type resultType, + bool isEntryPoint, + mlir::ModuleOp module); + + // --- JitArtifact construction --- + + /// Construct named JitArtifacts from the compiled functions in the JIT + /// engine. + /// + /// Uses the kernel's name and result metadata to determine the correct + /// mangled symbol names. Returns one named artifact per resolved symbol. + static std::vector + createJitArtifacts(const std::string &kernelName, cudaq::JitEngine engine, + const cudaq::ResultInfo &resultInfo, + bool isFullySpecialized); + + // --- CompiledModule construction --- + + /// Create a `CompiledModule` containing only JIT artifacts. + static cudaq::CompiledModule + createCompiledModule(std::string name, cudaq::ResultInfo resultInfo, + std::vector jitArtifacts); + + /// Create a `CompiledModule` containing only MLIR artifacts. + static cudaq::CompiledModule + createCompiledModule(std::string name, cudaq::ResultInfo resultInfo, + std::vector mlirArtifacts); + + /// Create a `CompiledModule` containing both JIT and MLIR artifacts. 
+ static cudaq::CompiledModule + createCompiledModule(std::string name, cudaq::ResultInfo resultInfo, + std::vector jitArtifacts, + std::vector mlirArtifacts); +}; + +} // namespace cudaq_internal::compiler diff --git a/runtime/internal/compiler/include/cudaq_internal/compiler/JIT.h b/runtime/internal/compiler/include/cudaq_internal/compiler/JIT.h index f886a4f912c..54db0cbaffe 100644 --- a/runtime/internal/compiler/include/cudaq_internal/compiler/JIT.h +++ b/runtime/internal/compiler/include/cudaq_internal/compiler/JIT.h @@ -28,7 +28,6 @@ class Type; namespace cudaq { class CompiledModule; -class ResultInfo; } // namespace cudaq namespace cudaq_internal::compiler { @@ -46,11 +45,4 @@ createWrappedKernel(std::string_view llvmIr, const std::string &kernelName, cudaq::JitEngine createJITEngine(mlir::ModuleOp &moduleOp, llvm::StringRef convertTo); -/// @brief Create a `ResultInfo` from MLIR type and module. -/// -/// When `resultType` is null or `isEntryPoint` is false, returns an empty -/// `ResultInfo`. -cudaq::ResultInfo createResultInfo(mlir::Type resultType, bool isEntryPoint, - mlir::ModuleOp module); - } // namespace cudaq_internal::compiler From ac40cc27af30b417f2581704dec05af1bc5b9a86 Mon Sep 17 00:00:00 2001 From: Sachin Pisal Date: Fri, 17 Apr 2026 14:18:42 -0700 Subject: [PATCH 16/85] removing email (#4344) Fixes #4343. Signed-off-by: Sachin Pisal --- .claude/skills/cudaq-guide/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.claude/skills/cudaq-guide/SKILL.md b/.claude/skills/cudaq-guide/SKILL.md index 9dda4831f1d..c7b210e928f 100644 --- a/.claude/skills/cudaq-guide/SKILL.md +++ b/.claude/skills/cudaq-guide/SKILL.md @@ -3,13 +3,13 @@ name: "cudaq-guide" title: "Cuda Quantum" description: "CUDA-Q onboarding guide for installation, test programs, GPU simulation, QPU hardware, and quantum applications." 
version: "1.0.0" -author: "Sachin Pisal " +author: "CUDA-Q" tags: [cuda-quantum, quantum-computing, onboarding, getting-started, nvidia] tools: [Read, Glob, Grep, Bash] license: "Apache License 2.0" compatibility: "Python 3.10+, C++ 20" metadata: - author: "Sachin Pisal " + author: "CUDA-Q" tags: - cuda-quantum - quantum-computing From 42352731802813c6ae2a66657e09c02d6695eb4d Mon Sep 17 00:00:00 2001 From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> Date: Sat, 18 Apr 2026 02:43:06 +1000 Subject: [PATCH 17/85] Add `eliminate-dead-heap-copy` pass for early `malloc` elimination (#4335) When a kernel returns a vector (for `cudaq::run`), we insert `__nvqpp_vectorCopyCtor` which performs a `malloc` + `memcpy` to copy stack data to the heap. After `AggressiveInlining` and `ReturnToOutputLog`, the heap copy becomes dead but remains in the IR. This is normally cleaned up by LLVM's optimization passes, but on code paths that emit MLIR directly (e.g., `nop` for backends that consume `quake`), these dead allocations persist and get sent to the server. This PR adds a new MLIR pass, `eliminate-dead-heap-copy`, that redirects reads from the `malloc`'d buffer to the original `memcpy` source (the stack `alloca`), then erases the dead `malloc`, `memcpy`, and `cc.stdvec_init` ops. This can be added on-demand via target yml file. Update the mock server test to demonstrate that. 
--------- Signed-off-by: Thien Nguyen --- include/cudaq/Optimizer/CodeGen/Passes.td | 28 ++++ lib/Optimizer/CodeGen/CMakeLists.txt | 1 + .../CodeGen/EliminateDeadHeapCopy.cpp | 109 +++++++++++++++ test/Transforms/eliminate_dead_heap_copy.qke | 126 ++++++++++++++++++ .../QuakeStartServerAndTest.sh.in | 9 ++ .../backends/quake_backend/mock_server.py | 11 +- .../backends/quake_backend/quake_fake.yml | 1 + unittests/backends/quake_backend/test_app.py | 95 +++++++++++++ 8 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp create mode 100644 test/Transforms/eliminate_dead_heap_copy.qke create mode 100644 unittests/backends/quake_backend/test_app.py diff --git a/include/cudaq/Optimizer/CodeGen/Passes.td b/include/cudaq/Optimizer/CodeGen/Passes.td index 0424599a46a..598e9d2c7c0 100644 --- a/include/cudaq/Optimizer/CodeGen/Passes.td +++ b/include/cudaq/Optimizer/CodeGen/Passes.td @@ -278,5 +278,33 @@ def ReturnToOutputLog : Pass<"return-to-output-log", "mlir::ModuleOp"> { ]; } +def EliminateDeadHeapCopy + : Pass<"eliminate-dead-heap-copy", "mlir::func::FuncOp"> { + let summary = "Eliminate dead heap copies from return value logging."; + let description = [{ + When a kernel returns a vector (e.g., measurement results), the frontend + wraps the return value with `__nvqpp_vectorCopyCtor`, which performs a + malloc+memcpy to copy the data from the callee's stack to the heap. This + is necessary because the returned data must outlive the callee's stack + frame when one kernel calls another. After AggressiveInlining, this + intrinsic is expanded into raw malloc and memcpy operations in the caller. + + After ReturnToOutputLog converts return values to QIR output logging + calls (e.g., `__quantum__rt__*_record_output`), it reads from the + cc.stdvec_init's buffer (the malloc'd pointer) and creates new load ops + from it, leaving the cc.stdvec_init with no users. 
The malloc+memcpy + are then only needed to populate the heap buffer that the output logging + reads from. This pass redirects those reads to the memcpy source (the + original stack data), making the malloc+memcpy dead, and erases them + along with the now-unused cc.stdvec_init. + + Note: this pass is only needed on code paths that do not run LLVM's + optimization passes (e.g., when emitting MLIR rather than LLVM IR for + a remote backend). When the full LLVM opt pipeline runs, it would eliminate + these dead allocations on its own. + }]; + let dependentDialects = ["cudaq::cc::CCDialect", "mlir::func::FuncDialect"]; +} + #endif // CUDAQ_OPT_OPTIMIZER_CODEGEN_PASSES diff --git a/lib/Optimizer/CodeGen/CMakeLists.txt b/lib/Optimizer/CodeGen/CMakeLists.txt index d64b2b32fd9..d6036b56e60 100644 --- a/lib/Optimizer/CodeGen/CMakeLists.txt +++ b/lib/Optimizer/CodeGen/CMakeLists.txt @@ -21,6 +21,7 @@ add_cudaq_library(OptCodeGen ConvertToQIR.cpp ConvertToQIRAPI.cpp DelayMeasurements.cpp + EliminateDeadHeapCopy.cpp OptUtils.cpp Passes.cpp Pipelines.cpp diff --git a/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp b/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp new file mode 100644 index 00000000000..981a45b0f48 --- /dev/null +++ b/lib/Optimizer/CodeGen/EliminateDeadHeapCopy.cpp @@ -0,0 +1,109 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "PassDetails.h" +#include "cudaq/Optimizer/CodeGen/Passes.h" +#include "cudaq/Optimizer/Dialect/CC/CCOps.h" + +#define DEBUG_TYPE "eliminate-dead-heap-copy" + +namespace cudaq::opt { +#define GEN_PASS_DEF_ELIMINATEDEADHEAPCOPY +#include "cudaq/Optimizer/CodeGen/Passes.h.inc" +} // namespace cudaq::opt + +using namespace mlir; + +namespace { + +/// When a kernel returns a vector, the frontend copies the stack data to the +/// heap via malloc+memcpy (from __nvqpp_vectorCopyCtor) so the data outlives +/// the callee's stack frame. After inlining and ReturnToOutputLog, the output +/// logging reads from the heap buffer through cc.cast ops, and the +/// cc.stdvec_init that wrapped the malloc becomes dead. This pass redirects +/// those cc.cast reads to the memcpy source (the original stack buffer) and +/// erases the now-dead malloc, memcpy, and cc.stdvec_init. +struct EliminateDeadHeapCopyPass + : public cudaq::opt::impl::EliminateDeadHeapCopyBase< + EliminateDeadHeapCopyPass> { + using EliminateDeadHeapCopyBase::EliminateDeadHeapCopyBase; + + void runOnOperation() override { + auto func = getOperation(); + SmallVector mallocCalls; + func.walk([&](func::CallOp callOp) { + if (callOp.getCallee() == "malloc") + mallocCalls.push_back(callOp); + }); + + for (auto mallocCall : mallocCalls) { + // malloc should return exactly one result (the allocated pointer). + if (mallocCall->getNumResults() != 1) + continue; + Value mallocResult = mallocCall.getResult(0); + + // Classify users of the malloc result. + func::CallOp memcpyCall; + SmallVector deadVecInits; + SmallVector castUsers; + bool hasUnsafeUser = false; + + for (auto *user : mallocResult.getUsers()) { + if (auto userCall = dyn_cast(user)) { + if (userCall.getCallee().starts_with("llvm.memcpy") && + userCall.getOperand(0) == mallocResult) { + if (memcpyCall) { + // Multiple memcpys to the same malloc dest — bail out. 
+ hasUnsafeUser = true; + break; + } + memcpyCall = userCall; + continue; + } + } + // A dead stdvec_init (no remaining users) can be safely erased. + // One with live users is treated as unsafe. + if (auto vecInit = dyn_cast(user)) { + if (vecInit->use_empty()) { + deadVecInits.push_back(vecInit); + continue; + } + } + // A cc.cast is safe to redirect: since the memcpy copies from + // source to the malloc buffer, reading through either pointer + // yields the same data. + if (auto castOp = dyn_cast(user)) { + castUsers.push_back(castOp); + continue; + } + // Any other user prevents elimination. + hasUnsafeUser = true; + break; + } + + if (!memcpyCall || hasUnsafeUser) + continue; + + Value memcpySrc = memcpyCall.getOperand(1); + + // Redirect cc.cast users from the malloc result to the memcpy source. + for (auto castOp : castUsers) + castOp->replaceUsesOfWith(mallocResult, memcpySrc); + + // Erase dead stdvec_inits. + for (auto vecInit : deadVecInits) + vecInit->erase(); + + // Erase memcpy and malloc. + memcpyCall->erase(); + mallocCall->erase(); + } + } +}; + +} // namespace diff --git a/test/Transforms/eliminate_dead_heap_copy.qke b/test/Transforms/eliminate_dead_heap_copy.qke new file mode 100644 index 00000000000..0ac14f07e31 --- /dev/null +++ b/test/Transforms/eliminate_dead_heap_copy.qke @@ -0,0 +1,126 @@ +// ========================================================================== // +// Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. // +// All rights reserved. // +// // +// This source code and the accompanying materials are made available under // +// the terms of the Apache License 2.0 which accompanies this distribution. // +// ========================================================================== // + +// RUN: cudaq-opt --eliminate-dead-heap-copy %s | FileCheck %s + +// After ReturnToOutputLog, the malloc+memcpy used to create a heap copy of +// stack data for stdvec returns becomes dead. 
The only remaining users of the +// malloc result are the memcpy (as dest) and a cc.cast that feeds into +// record_output calls. This pass should replace the cc.cast's use of the +// malloc result with the memcpy source and erase the dead malloc+memcpy. + +func.func private @malloc(i64) -> !cc.ptr +func.func private @llvm.memcpy.p0i8.p0i8.i64(!cc.ptr, !cc.ptr, i64, i1) +func.func private @__quantum__rt__int_record_output(i64, !cc.ptr) +func.func private @__quantum__rt__array_record_output(i64, !cc.ptr) + +// Test basic malloc+memcpy elimination where the malloc result is only used +// by memcpy (as dest) and a cc.cast. +func.func @test_basic_elimination() { + %c40 = arith.constant 40 : i64 + %c5 = arith.constant 5 : i64 + %false = arith.constant false + %alloca = cc.alloca !cc.array + %cast_src = cc.cast %alloca : (!cc.ptr>) -> !cc.ptr + %malloc_res = call @malloc(%c40) : (i64) -> !cc.ptr + call @llvm.memcpy.p0i8.p0i8.i64(%malloc_res, %cast_src, %c40, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () + %cast_dst = cc.cast %malloc_res : (!cc.ptr) -> !cc.ptr> + %ptr0 = cc.compute_ptr %cast_dst[0] : (!cc.ptr>) -> !cc.ptr + %val0 = cc.load %ptr0 : !cc.ptr + %label = cc.string_literal "[0]" : !cc.ptr> + %label_cast = cc.cast %label : (!cc.ptr>) -> !cc.ptr + call @__quantum__rt__int_record_output(%val0, %label_cast) : (i64, !cc.ptr) -> () + return +} + +// CHECK-LABEL: func.func @test_basic_elimination() { +// CHECK: %[[VAL_0:.*]] = arith.constant 40 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 5 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant false +// CHECK: %[[VAL_3:.*]] = cc.alloca !cc.array +// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.string_literal "[0]" : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] 
: (!cc.ptr>) -> !cc.ptr +// CHECK: call @__quantum__rt__int_record_output(%[[VAL_7]], %[[VAL_9]]) : (i64, !cc.ptr) -> () +// CHECK: return +// CHECK: } + +// Test that dead cc.stdvec_init ops are also removed. +func.func @test_dead_stdvec_init() { + %c40 = arith.constant 40 : i64 + %c5 = arith.constant 5 : i64 + %false = arith.constant false + %alloca = cc.alloca !cc.array + %cast_src = cc.cast %alloca : (!cc.ptr>) -> !cc.ptr + %malloc_res = call @malloc(%c40) : (i64) -> !cc.ptr + call @llvm.memcpy.p0i8.p0i8.i64(%malloc_res, %cast_src, %c40, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () + %dead_vec = cc.stdvec_init %malloc_res, %c5 : (!cc.ptr, i64) -> !cc.stdvec + %cast_dst = cc.cast %malloc_res : (!cc.ptr) -> !cc.ptr> + %ptr0 = cc.compute_ptr %cast_dst[0] : (!cc.ptr>) -> !cc.ptr + %val0 = cc.load %ptr0 : !cc.ptr + %label = cc.string_literal "[0]" : !cc.ptr> + %label_cast = cc.cast %label : (!cc.ptr>) -> !cc.ptr + call @__quantum__rt__int_record_output(%val0, %label_cast) : (i64, !cc.ptr) -> () + return +} + +// CHECK-LABEL: func.func @test_dead_stdvec_init() { +// CHECK: %[[VAL_0:.*]] = arith.constant 40 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 5 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant false +// CHECK: %[[VAL_3:.*]] = cc.alloca !cc.array +// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.string_literal "[0]" : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr +// CHECK: call @__quantum__rt__int_record_output(%[[VAL_7]], %[[VAL_9]]) : (i64, !cc.ptr) -> () +// CHECK: return +// CHECK: } + +// Test that malloc is NOT removed when it has non-memcpy, non-stdvec_init +// users that cannot be redirected (e.g., another call using it as an argument). 
+func.func private @use_ptr(!cc.ptr) + +func.func @test_no_elimination_extra_user() { + %c40 = arith.constant 40 : i64 + %false = arith.constant false + %alloca = cc.alloca !cc.array + %cast_src = cc.cast %alloca : (!cc.ptr>) -> !cc.ptr + %malloc_res = call @malloc(%c40) : (i64) -> !cc.ptr + call @llvm.memcpy.p0i8.p0i8.i64(%malloc_res, %cast_src, %c40, %false) : (!cc.ptr, !cc.ptr, i64, i1) -> () + call @use_ptr(%malloc_res) : (!cc.ptr) -> () + %cast_dst = cc.cast %malloc_res : (!cc.ptr) -> !cc.ptr> + %ptr0 = cc.compute_ptr %cast_dst[0] : (!cc.ptr>) -> !cc.ptr + %val0 = cc.load %ptr0 : !cc.ptr + %label = cc.string_literal "[0]" : !cc.ptr> + %label_cast = cc.cast %label : (!cc.ptr>) -> !cc.ptr + call @__quantum__rt__int_record_output(%val0, %label_cast) : (i64, !cc.ptr) -> () + return +} + +// CHECK-LABEL: func.func @test_no_elimination_extra_user() { +// CHECK: %[[VAL_0:.*]] = arith.constant 40 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant false +// CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array +// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_4:.*]] = call @malloc(%[[VAL_0]]) : (i64) -> !cc.ptr +// CHECK: call @llvm.memcpy.p0i8.p0i8.i64(%[[VAL_4]], %[[VAL_3]], %[[VAL_0]], %[[VAL_1]]) : (!cc.ptr, !cc.ptr, i64, i1) -> () +// CHECK: call @use_ptr(%[[VAL_4]]) : (!cc.ptr) -> () +// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr) -> !cc.ptr> +// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.string_literal "[0]" : !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr +// CHECK: call @__quantum__rt__int_record_output(%[[VAL_7]], %[[VAL_9]]) : (i64, !cc.ptr) -> () +// CHECK: return +// CHECK: } diff --git a/unittests/backends/quake_backend/QuakeStartServerAndTest.sh.in b/unittests/backends/quake_backend/QuakeStartServerAndTest.sh.in index 26617326ebd..989039f1870 100644 --- 
a/unittests/backends/quake_backend/QuakeStartServerAndTest.sh.in +++ b/unittests/backends/quake_backend/QuakeStartServerAndTest.sh.in @@ -80,6 +80,15 @@ else fi fi +# Run the Python test +PYTHONPATH=@CMAKE_BINARY_DIR@/python @Python_EXECUTABLE@ @CMAKE_SOURCE_DIR@/unittests/backends/quake_backend/test_app.py +if [ $? -ne 0 ]; then + echo ":x: Python test_app.py failed" + test_err_sum=$((test_err_sum+1)) +else + echo ":white_check_mark: Successfully ran Python test_app.py" +fi + # kill the server kill -INT $pid # return success / failure diff --git a/unittests/backends/quake_backend/mock_server.py b/unittests/backends/quake_backend/mock_server.py index 3d497e0608d..9adc4456af0 100644 --- a/unittests/backends/quake_backend/mock_server.py +++ b/unittests/backends/quake_backend/mock_server.py @@ -9,7 +9,7 @@ import cudaq from fastapi import FastAPI, HTTPException, Header, Request from typing import Union -import uvicorn, uuid, base64, ctypes, sys +import uvicorn, uuid, base64, ctypes, sys, re from pydantic import BaseModel from llvmlite import binding as llvm from cudaq.mlir.passmanager import PassManager @@ -39,6 +39,15 @@ async def postJob(request: Request): payload = await request.json() # Decode base64 decoded_payload = base64.b64decode(payload["ir"]).decode('utf-8') + # Verify that the input MLIR does not contain actual `malloc` or `memcpy` calls. + # Match `@malloc` or `@llvm.memcpy` as function references (calls or declarations). + if re.search(r'@malloc\b', decoded_payload) or \ + re.search(r'@(llvm\.)?memcpy\b', decoded_payload): + raise RuntimeError( + "Input MLIR contains malloc or memcpy calls. " + "These should have been eliminated by the eliminate-dead-heap-copy pass." 
+ ) + ctx = getMLIRContext() recovered_mod = Module.parse(decoded_payload, context=ctx) pm = PassManager.parse( diff --git a/unittests/backends/quake_backend/quake_fake.yml b/unittests/backends/quake_backend/quake_fake.yml index c9d9f8a53e6..47da8122955 100644 --- a/unittests/backends/quake_backend/quake_fake.yml +++ b/unittests/backends/quake_backend/quake_fake.yml @@ -19,6 +19,7 @@ config: link-libs: ["-lcudaq-rest-qpu"] # Define the JIT lowering pipeline jit-high-level-pipeline: "expand-measurements" + jit-low-level-pipeline: "return-to-output-log,func.func(eliminate-dead-heap-copy),symbol-dce" # Tell the rest-qpu that we are simply dumping CUDA-Q MLIR code. codegen-emission: nop # Library mode is only for simulators, physical backends must turn this off diff --git a/unittests/backends/quake_backend/test_app.py b/unittests/backends/quake_backend/test_app.py new file mode 100644 index 00000000000..ee55a114a3b --- /dev/null +++ b/unittests/backends/quake_backend/test_app.py @@ -0,0 +1,95 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. 
# +# ============================================================================ # +import cudaq +import sys + +cudaq.set_target("quake_fake") + +qubit_count = 5 + + +@cudaq.kernel +def kernel() -> list[int]: + qvector = cudaq.qvector(qubit_count) + + for i in range(qubit_count - 1): + h(qvector[i]) + + s(qvector[0]) + r1(math.pi / 2, qvector[1]) + a = mz(qvector) + return a + + +@cudaq.kernel +def all_zeros() -> list[int]: + q = cudaq.qvector(4) + return mz(q) + + +@cudaq.kernel +def all_ones() -> list[int]: + q = cudaq.qvector(4) + x(q) + return mz(q) + + +@cudaq.kernel +def alternating_01() -> list[int]: + q = cudaq.qvector(4) + x(q[1]) + x(q[3]) + return mz(q) + + +@cudaq.kernel +def single_qubit_flip() -> list[int]: + q = cudaq.qvector(1) + x(q[0]) + return mz(q) + + +try: + res = cudaq.run(kernel) + assert res is not None + assert len(res) > 0 + assert len(res[0]) == qubit_count + for shot in res: + for val in shot: + assert val in (0, 1) + + # Deterministic: all qubits stay |0>. + res = cudaq.run(all_zeros) + assert len(res) > 0 + for shot in res: + assert list(shot) == [0, 0, 0, + 0], f"expected [0,0,0,0], got {list(shot)}" + + # Deterministic: X on all qubits -> all |1>. + res = cudaq.run(all_ones) + assert len(res) > 0 + for shot in res: + assert list(shot) == [1, 1, 1, + 1], f"expected [1,1,1,1], got {list(shot)}" + + # Deterministic: X on qubits 1 and 3 -> [0,1,0,1]. + res = cudaq.run(alternating_01) + assert len(res) > 0 + for shot in res: + assert list(shot) == [0, 1, 0, + 1], f"expected [0,1,0,1], got {list(shot)}" + + # Deterministic: single qubit X -> [1]. 
+ res = cudaq.run(single_qubit_flip) + assert len(res) > 0 + for shot in res: + assert list(shot) == [1], f"expected [1], got {list(shot)}" + +except Exception as e: + print(e) + sys.exit(1) From 8d7e922dfb0981153530fbdd0e5e566bbce2c53e Mon Sep 17 00:00:00 2001 From: Sachin Pisal Date: Fri, 17 Apr 2026 21:14:01 -0700 Subject: [PATCH 18/85] Updating cuquantum version to 26.03.1 (#4342) Updating cuquantum version to 26.03.1 --------- Signed-off-by: Sachin Pisal --- docker/build/devcontainer.Dockerfile | 2 +- pyproject.toml.cu12 | 6 +++--- pyproject.toml.cu13 | 6 +++--- scripts/configure_build.sh | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker/build/devcontainer.Dockerfile b/docker/build/devcontainer.Dockerfile index 51fa1c72ea5..4100c90e340 100644 --- a/docker/build/devcontainer.Dockerfile +++ b/docker/build/devcontainer.Dockerfile @@ -183,7 +183,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ else \ cupy_version=13.4.1; \ fi && \ - python3 -m pip install --break-system-packages cupy-cuda$(echo $CUDA_VERSION | cut -d . -f1)x==${cupy_version} cuquantum-cu$(echo $CUDA_VERSION | cut -d . -f1)==26.1.0 && \ + python3 -m pip install --break-system-packages cupy-cuda$(echo $CUDA_VERSION | cut -d . -f1)x==${cupy_version} cuquantum-cu$(echo $CUDA_VERSION | cut -d . -f1)==26.3.1 && \ if [ "$(python3 --version | grep -o [0-9\.]* | cut -d . 
-f -2)" != "3.12" ]; then \ echo "expecting Python version 3.12"; \ fi diff --git a/pyproject.toml.cu12 b/pyproject.toml.cu12 index b86d7743d64..b07ec657757 100644 --- a/pyproject.toml.cu12 +++ b/pyproject.toml.cu12 @@ -19,9 +19,9 @@ license = "Apache-2.0" license-files = ["LICENSE", "NOTICE", "CITATION.cff"] dependencies = [ 'astpretty ~= 3.0', - 'custatevec-cu12 ~= 1.12', - 'cutensornet-cu12 ~= 2.11', - 'cudensitymat-cu12 ~= 0.4', + 'custatevec-cu12 ~= 1.13.1', + 'cutensornet-cu12 ~= 2.12.1', + 'cudensitymat-cu12 ~= 0.5.1', 'numpy >= 1.24', 'scipy >= 1.10.1', 'requests >= 2.32.4', diff --git a/pyproject.toml.cu13 b/pyproject.toml.cu13 index bae9dbbb929..7e9a6dd6926 100644 --- a/pyproject.toml.cu13 +++ b/pyproject.toml.cu13 @@ -23,9 +23,9 @@ dependencies = [ 'scipy >= 1.10.1', 'requests >= 2.32.4', # CUDA dependencies - excluded on macOS (CPU-only support) - 'custatevec-cu13 ~= 1.12; sys_platform != "darwin"', - 'cutensornet-cu13 ~= 2.11; sys_platform != "darwin"', - 'cudensitymat-cu13 ~= 0.4; sys_platform != "darwin"', + 'custatevec-cu13 ~= 1.13.1; sys_platform != "darwin"', + 'cutensornet-cu13 ~= 2.12.1; sys_platform != "darwin"', + 'cudensitymat-cu13 ~= 0.5.1; sys_platform != "darwin"', 'nvidia-cublas ~= 13.0; sys_platform != "darwin"', 'nvidia-curand ~= 10.4; sys_platform != "darwin"', 'nvidia-cusparse ~= 12.6; sys_platform != "darwin"', diff --git a/scripts/configure_build.sh b/scripts/configure_build.sh index d065b1d2072..17d4cc36125 100644 --- a/scripts/configure_build.sh +++ b/scripts/configure_build.sh @@ -75,7 +75,7 @@ if [ "$1" == "install-cuquantum" ]; then CUDA_ARCH_FOLDER=$([ "$(uname -m)" == "aarch64" ] && echo sbsa || echo x86_64) # [>cuQuantumInstall] - CUQUANTUM_VERSION=26.01.0.4 + CUQUANTUM_VERSION=26.03.1.9 CUQUANTUM_DOWNLOAD_URL=https://developer.download.nvidia.com/compute/cuquantum/redist/cuquantum cuquantum_archive=cuquantum-linux-${CUDA_ARCH_FOLDER}-${CUQUANTUM_VERSION}_cuda$(echo ${CUDA_VERSION} | cut -d . 
-f1)-archive.tar.xz From a45a06e7ae808ee245791602b39bf89a55069951 Mon Sep 17 00:00:00 2001 From: Pradnya Khalate <148914294+khalatepradnya@users.noreply.github.com> Date: Sat, 18 Apr 2026 22:07:18 -0700 Subject: [PATCH 19/85] [codegen] Restore the OpenQASM2.0 pipeline (#4346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Background `cudaq.sample` with `set_target("braket")` fails on v0.14.0+ with: RuntimeError: [line 10] cannot declare bit register. Only 1 bit register(s) is/are supported Amazon Braket's OpenQASM 2.0 parser enforces exactly one classical register per circuit. The payload CUDA-Q emits for the Bell-state reproducer in #4341 contains two. ## Root cause `addPipelineTranslateToOpenQASM` (`lib/Optimizer/CodeGen/Pipelines.cpp`) was refactored in #3693 to run `ExpandMeasurements` unconditionally. For `qasm2` backends that run `combine-measurements` in the mid pipeline (Braket, Scaleway, Quantum Machines), the sequence becomes: 1. Mid pipeline: `combine-measurements` merges per-qubit measurements into a single `quake.mz` on the whole `!quake.veq` - the intent being "emit one `creg` spanning all qubits". 2. Translate pipeline: `ExpandMeasurements` re-expands the combined `mz` into one `mz` per qubit, then loop-unrolls. 3. OpenQASM2.0 emitter: writes one `creg` declaration per `mz`. Target-specific YAML intent is silently overridden in the translate pipeline. ## Fix 1. `lib/Optimizer/CodeGen/Pipelines.cpp`: revert `addPipelineTranslateToOpenQASM` to the thin cleanup it was before #3693 . Each backend's YAML now drives measurement expansion. 2. `infleqtion.yml` and `tii.yml`: add `jit-high-level-pipeline: "expand-measurements"`. These targets previously depended on the unconditional expansion to get one `creg` per measured qubit; the explicit entry preserves that behavior. 3. 
`test/Translate/OpenQASM/basic.qke` and `test/Translate/openqasm2_*.cpp`: update CHECK lines to match the single-`creg` output for a vector `mz` (which is what the emitter produces after the fix). ## Impact | Backend | creg count for `mz(qvector(n))` | |---|---| | Braket, Scaleway, Quantum Machines | 1 (single `creg` of size n) | | Infleqtion, TII | n (preserved via new YAML entry) | | Quantinuum, IQM, OQC, Anyon, QCI | n (unchanged; already had `expand-measurements` in YAML) | The change is scoped to `addPipelineTranslateToOpenQASM`, which only runs for `codegen-emission: qasm2`. Simulators and non-OpenQASM2.0 backends are unaffected. ## Testing - `ninja check-cudaq-mlir` passes with the updated CHECK lines. - `cudaq.translate(kernel, format="openqasm2")` under `set_target(...)` for Braket, Scaleway, Infleqtion, TII — creg counts match the matrix above. - Reproducer from #4341 now emits exactly the "expected" OpenQASM2.0 shown in the issue: `creg var3[2]; measure var0 -> var3;`. - Manually tested against real servers: `test_braket.py`, `test_Infleqtion.py`, `test_tii.py`, `test_scaleway.py`. ## Follow-up An automated local test set up for OpenQASM payload validator will be added in a separate PR. Fixes #4341. 
--------- Signed-off-by: Pradnya Khalate --- lib/Optimizer/CodeGen/Pipelines.cpp | 1 - .../rest/helpers/infleqtion/infleqtion.yml | 1 + .../platform/default/rest/helpers/tii/tii.yml | 1 + test/Translate/OpenQASM/basic.qke | 45 +++++++++---------- .../OpenQASM/combine_then_translate.qke | 31 +++++++++++++ test/Translate/openqasm2_adj_rotations.cpp | 14 +----- test/Translate/openqasm2_loop.cpp | 12 +---- test/Translate/openqasm2_simple.cpp | 6 +-- test/Translate/openqasm2_vector.cpp | 6 +-- 9 files changed, 62 insertions(+), 55 deletions(-) create mode 100644 test/Translate/OpenQASM/combine_then_translate.qke diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp index 43b26f15f2b..377b52b7797 100644 --- a/lib/Optimizer/CodeGen/Pipelines.cpp +++ b/lib/Optimizer/CodeGen/Pipelines.cpp @@ -171,7 +171,6 @@ void cudaq::opt::createPipelineTransformsForPythonToOpenQASM( } void cudaq::opt::addPipelineTranslateToOpenQASM(PassManager &pm) { - createCommonTargetCodegenPipeline(pm, {}); pm.addNestedPass(createClassicalMemToReg()); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(createDeadStoreRemoval()); diff --git a/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml b/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml index f377488eada..74bc3eaeac6 100644 --- a/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml +++ b/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml @@ -19,6 +19,7 @@ config: # Add preprocessor defines to compilation preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"] # Define the JIT lowering pipeline + jit-high-level-pipeline: "expand-measurements" jit-mid-level-pipeline: "lower-to-cfg,decomposition{basis=h,s,t,r1,rx,ry,rz,x,y,z,x(1)},quake-to-cc-prep,func.func(memtoreg{quantum=0})" # Tell the rest-qpu that we are generating OpenQASM 2.0. 
codegen-emission: qasm2 diff --git a/runtime/cudaq/platform/default/rest/helpers/tii/tii.yml b/runtime/cudaq/platform/default/rest/helpers/tii/tii.yml index 0de439667b2..ca33039a306 100644 --- a/runtime/cudaq/platform/default/rest/helpers/tii/tii.yml +++ b/runtime/cudaq/platform/default/rest/helpers/tii/tii.yml @@ -19,6 +19,7 @@ config: # Add preprocessor defines to compilation preprocessor-defines: ["-D CUDAQ_QUANTUM_DEVICE"] # Define the JIT lowering pipelines + jit-high-level-pipeline: "expand-measurements" jit-mid-level-pipeline: "lower-to-cfg,func.func(canonicalize,multicontrol-decomposition),decomposition{basis=h,s,t,r1,rx,ry,rz,x,y,z,x(1)},quake-to-cc-prep,func.func(expand-control-veqs,canonicalize),symbol-dce" codegen-emission: qasm2 # Library mode is only for simulators, physical backends must turn this off diff --git a/test/Translate/OpenQASM/basic.qke b/test/Translate/OpenQASM/basic.qke index 7cb8fd719b5..6db71117e66 100644 --- a/test/Translate/OpenQASM/basic.qke +++ b/test/Translate/OpenQASM/basic.qke @@ -95,28 +95,25 @@ module { // CHECK: cx q2, q0; // CHECK: } -// CHECK: qreg var0[10]; -// CHECK: x var0[1]; -// CHECK: x var0[5]; -// CHECK: x var0[6]; -// CHECK: x var0[7]; -// CHECK: x var0[8]; -// CHECK: @maj var0[0], var0[5], var0[1]; -// CHECK: @maj var0[1], var0[6], var0[2]; -// CHECK: @maj var0[2], var0[7], var0[3]; -// CHECK: @maj var0[3], var0[8], var0[4]; -// CHECK: cx var0[4], var0[9]; -// CHECK: @umaj var0[3], var0[8], var0[4]; -// CHECK: @umaj var0[2], var0[7], var0[3]; -// CHECK: @umaj var0[1], var0[6], var0[2]; -// CHECK: @umaj var0[9], var0[5], var0[1]; -// CHECK: creg var11[1]; -// CHECK: measure var0[5] -> var11[0]; -// CHECK: creg var12[1]; -// CHECK: measure var0[6] -> var12[0]; +// CHECK: qreg var0[1]; +// CHECK: qreg var1[4]; +// CHECK: qreg var2[4]; +// CHECK: qreg var3[1]; +// CHECK: x var1[0]; +// CHECK: x var2[0]; +// CHECK: x var2[1]; +// CHECK: x var2[2]; +// CHECK: x var2[3]; +// CHECK: @maj var0[0], var2[0], var1[0]; +// 
CHECK: @maj var1[0], var2[1], var1[1]; +// CHECK: @maj var1[1], var2[2], var1[2]; +// CHECK: @maj var1[2], var2[3], var1[3]; +// CHECK: cx var1[3], var3[0]; +// CHECK: @umaj var1[2], var2[3], var1[3]; +// CHECK: @umaj var1[1], var2[2], var1[2]; +// CHECK: @umaj var1[0], var2[1], var1[1]; +// CHECK: @umaj var3[0], var2[0], var1[0]; +// CHECK: creg var12[4]; +// CHECK: measure var2 -> var12; // CHECK: creg var13[1]; -// CHECK: measure var0[7] -> var13[0]; -// CHECK: creg var14[1]; -// CHECK: measure var0[8] -> var14[0]; -// CHECK: creg var15[1]; -// CHECK: measure var0[9] -> var15[0]; +// CHECK: measure var3[0] -> var13[0]; diff --git a/test/Translate/OpenQASM/combine_then_translate.qke b/test/Translate/OpenQASM/combine_then_translate.qke new file mode 100644 index 00000000000..3525af1516f --- /dev/null +++ b/test/Translate/OpenQASM/combine_then_translate.qke @@ -0,0 +1,31 @@ +// ========================================================================== // +// Copyright (c) 2026 NVIDIA Corporation & Affiliates. // +// All rights reserved. // +// // +// This source code and the accompanying materials are made available under // +// the terms of the Apache License 2.0 which accompanies this distribution. 
// +// ========================================================================== // + +// RUN: cudaq-opt -combine-measurements -canonicalize %s | cudaq-translate --convert-to=openqasm2 | FileCheck %s + +module { + func.func @bell() attributes {"cudaq-entrypoint"} { + %q = quake.alloca !quake.veq<2> + %q0 = quake.extract_ref %q[0] : (!quake.veq<2>) -> !quake.ref + %q1 = quake.extract_ref %q[1] : (!quake.veq<2>) -> !quake.ref + quake.h %q0 : (!quake.ref) -> () + quake.x [%q0] %q1 : (!quake.ref, !quake.ref) -> () + %m0 = quake.mz %q0 : (!quake.ref) -> !quake.measure + %m1 = quake.mz %q1 : (!quake.ref) -> !quake.measure + return + } +} + +// CHECK: OPENQASM 2.0; +// CHECK: include "qelib1.inc"; +// CHECK: qreg var0[2]; +// CHECK: h var0[0]; +// CHECK: cx var0[0], var0[1]; +// CHECK: creg var3[2]; +// CHECK: measure var0 -> var3; +// CHECK-NOT: creg diff --git a/test/Translate/openqasm2_adj_rotations.cpp b/test/Translate/openqasm2_adj_rotations.cpp index 2f3b8ff25d7..24ae89dc954 100644 --- a/test/Translate/openqasm2_adj_rotations.cpp +++ b/test/Translate/openqasm2_adj_rotations.cpp @@ -59,15 +59,5 @@ int main() { // CHECK: x var0[1]; // CHECK: ch var0[0], var0[1]; // CHECK: rz(-6.700000e-01) var0[0]; -// CHECK: creg var7[1]; -// CHECK: measure var0[0] -> var7[0]; -// CHECK: creg var8[1]; -// CHECK: measure var0[1] -> var8[0]; -// CHECK: creg var9[1]; -// CHECK: measure var0[2] -> var9[0]; -// CHECK: creg var10[1]; -// CHECK: measure var0[3] -> var10[0]; -// CHECK: creg var11[1]; -// CHECK: measure var0[4] -> var11[0]; -// CHECK: creg var12[1]; -// CHECK: measure var0[5] -> var12[0]; +// CHECK: creg var7[6]; +// CHECK: measure var0 -> var7; diff --git a/test/Translate/openqasm2_loop.cpp b/test/Translate/openqasm2_loop.cpp index eda2c4e32cb..dd260cac988 100644 --- a/test/Translate/openqasm2_loop.cpp +++ b/test/Translate/openqasm2_loop.cpp @@ -53,13 +53,5 @@ int main() { // CHECK: cx var0[2], var0[3]; // CHECK: cx var0[3], var0[4]; // CHECK: ccx var0[0], var0[2], var0[1]; 
-// CHECK: creg var6[1]; -// CHECK: measure var0[0] -> var6[0]; -// CHECK: creg var7[1]; -// CHECK: measure var0[1] -> var7[0]; -// CHECK: creg var8[1]; -// CHECK: measure var0[2] -> var8[0]; -// CHECK: creg var9[1]; -// CHECK: measure var0[3] -> var9[0]; -// CHECK: creg var10[1]; -// CHECK: measure var0[4] -> var10[0]; +// CHECK: creg var6[5]; +// CHECK: measure var0 -> var6; diff --git a/test/Translate/openqasm2_simple.cpp b/test/Translate/openqasm2_simple.cpp index c8566a51d48..1d98483d5a5 100644 --- a/test/Translate/openqasm2_simple.cpp +++ b/test/Translate/openqasm2_simple.cpp @@ -36,7 +36,5 @@ int main() { // CHECK: qreg var0[2]; // CHECK: h var0[0]; // CHECK: cx var0[0], var0[1]; -// CHECK: creg var3[1]; -// CHECK: measure var0[0] -> var3[0]; -// CHECK: creg var4[1]; -// CHECK: measure var0[1] -> var4[0]; +// CHECK: creg var3[2]; +// CHECK: measure var0 -> var3; diff --git a/test/Translate/openqasm2_vector.cpp b/test/Translate/openqasm2_vector.cpp index 1911e784d9d..183cdbcb1a9 100644 --- a/test/Translate/openqasm2_vector.cpp +++ b/test/Translate/openqasm2_vector.cpp @@ -40,7 +40,5 @@ int main() { // CHECK: cx var0[1], var0[0]; // CHECK: ry(7.853982e-01) var0[0]; // CHECK: cx var0[1], var0[0]; -// CHECK: creg var3[1]; -// CHECK: measure var0[0] -> var3[0]; -// CHECK: creg var4[1]; -// CHECK: measure var0[1] -> var4[0]; +// CHECK: creg var3[2]; +// CHECK: measure var0 -> var3; From 3e6c664be8aa419ef9478a853dbd4c896baf4aec Mon Sep 17 00:00:00 2001 From: Pradnya Khalate <148914294+khalatepradnya@users.noreply.github.com> Date: Sun, 19 Apr 2026 14:57:28 -0700 Subject: [PATCH 20/85] Revert `measure_result` type distinction and `!quake.measurements` infrastructure (#4349) ## Summary Reverts PRs - #3800, - #4204, - #4208, - #4266, - #4267. Following an architecture alignment meeting (Apr 17), we are changing direction on how measurement results are represented in CUDA-Q. 
The `measure_result` standalone class and `!quake.measurements` Quake type introduced by these PRs are being replaced by a new `measure_handle` approach with fundamentally different semantics. This revert restores: * `measure_result` as a typedef to bool (compiler mode) * Multi-qubit mz returning `!cc.stdvec` * Removes `!quake.measurements` type, `quake.get_measure`, `quake.measurements_size` ops * Removes `quake.relax_size` extension for measurements * Removes `QIRResultArrayCreate` / `QIRResultArrayGetElementPtr1d` QIR intrinsics * Removes 8 test files added by the reverted PRs ### Forward direction (follow-up PRs): New `measure_handle` Signed-off-by: Pradnya Khalate --- .../cpp/basics/mid_circuit_measurement.cpp | 2 +- .../sphinx/examples/cpp/measuring_kernels.cpp | 4 +- .../examples/cpp/sample_to_run_migration.cpp | 2 +- .../Optimizer/CodeGen/QIRFunctionNames.h | 4 - .../cudaq/Optimizer/Dialect/Quake/QuakeOps.td | 124 +---- .../Optimizer/Dialect/Quake/QuakeTypes.h | 3 +- .../Optimizer/Dialect/Quake/QuakeTypes.td | 35 -- include/cudaq/Optimizer/Transforms/Passes.td | 33 +- lib/Frontend/nvqpp/ASTBridge.cpp | 6 +- lib/Frontend/nvqpp/ConvertDecl.cpp | 36 +- lib/Frontend/nvqpp/ConvertExpr.cpp | 145 +----- lib/Frontend/nvqpp/ConvertStmt.cpp | 56 --- lib/Frontend/nvqpp/ConvertType.cpp | 18 +- lib/Optimizer/Builder/Intrinsics.cpp | 2 - lib/Optimizer/Builder/Marshal.cpp | 8 +- lib/Optimizer/CodeGen/ConvertToExecMgr.cpp | 3 +- lib/Optimizer/CodeGen/ConvertToQIR.cpp | 6 +- lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp | 371 ++------------ lib/Optimizer/CodeGen/QuakeToExecMgr.cpp | 24 +- lib/Optimizer/CodeGen/QuakeToLLVM.cpp | 33 +- .../CodeGen/WireSetsToProfileQIR.cpp | 3 - lib/Optimizer/Dialect/CC/CCOps.cpp | 4 - .../Dialect/Quake/CanonicalPatterns.inc | 114 ----- lib/Optimizer/Dialect/Quake/QuakeOps.cpp | 79 +-- lib/Optimizer/Dialect/Quake/QuakeTypes.cpp | 32 +- lib/Optimizer/Transforms/AddMeasurements.cpp | 12 +- .../Transforms/CombineMeasurements.cpp | 32 +- 
.../Transforms/ExpandMeasurements.cpp | 359 +++----------- lib/Optimizer/Transforms/Mapping.cpp | 2 - python/cudaq/kernel/ast_bridge.py | 12 +- python/cudaq/kernel/kernel_builder.py | 45 +- python/runtime/mlir/py_register_dialects.cpp | 12 - python/tests/mlir/adjoint.py | 2 +- python/tests/mlir/bug_1777.py | 4 +- python/tests/mlir/call_qpu.py | 4 +- python/tests/mlir/measure.py | 6 +- python/tests/mlir/swap.py | 2 +- runtime/cudaq/builder/QuakeValue.cpp | 19 +- runtime/cudaq/builder/kernel_builder.cpp | 12 +- runtime/cudaq/qis/execution_manager.h | 39 ++ runtime/cudaq/qis/execution_manager_c_api.cpp | 2 +- runtime/cudaq/qis/measure_result.h | 78 --- runtime/cudaq/qis/qubit_qis.h | 49 +- runtime/nvqir/QIRTypes.cpp | 8 - runtime/nvqir/QIRTypes.h | 8 +- targettests/Kernel/inline-qpu-func.cpp | 5 +- targettests/Kernel/signature-0.cpp | 4 +- targettests/execution/auto_kernel.cpp | 2 +- targettests/execution/conditional_run.cpp | 2 +- targettests/execution/cudaq_run.cpp | 48 +- .../execution/cudaq_run_dynamic_result.cpp | 8 +- targettests/execution/cudaq_run_emulation.cpp | 45 +- targettests/execution/qir_cond_for_loop-3.cpp | 11 +- .../qubit_management_if_classical.cpp | 2 +- .../quantinuum/reset_after_measure.cpp | 4 - test/AST-Quake/auto_kernel-1.cpp | 19 +- test/AST-Quake/auto_kernel-2.cpp | 17 +- test/AST-Quake/base_profile-0.cpp | 1 - .../{qir_profiles.cpp => base_profile-1.cpp} | 23 +- test/AST-Quake/bool_literal.cpp | 8 +- test/AST-Quake/bug_3270.cpp | 19 +- test/AST-Quake/call_qpu.cpp | 29 +- test/AST-Quake/cast.cpp | 156 +++--- test/AST-Quake/const_reference_extension.cpp | 37 +- test/AST-Quake/control_flow.cpp | 8 +- test/AST-Quake/ctrl_vector.cpp | 2 +- test/AST-Quake/cudaq_run.cpp | 6 +- test/AST-Quake/cudaq_types.cpp | 2 +- test/AST-Quake/grover.cpp | 2 +- test/AST-Quake/if.cpp | 68 +-- test/AST-Quake/indirect_callable.cpp | 2 +- test/AST-Quake/loop_unroll-1.cpp | 22 +- test/AST-Quake/loop_unroll-3.cpp | 2 +- test/AST-Quake/measure_bell.cpp | 120 ++--- 
test/AST-Quake/measure_result_assign.cpp | 21 - test/AST-Quake/measure_result_compare.cpp | 81 ---- .../AST-Quake/measure_result_device_entry.cpp | 38 -- test/AST-Quake/mz.cpp | 122 ++--- test/AST-Quake/qalloc_initialization.cpp | 14 +- test/AST-Quake/qalloc_state.cpp | 8 +- test/AST-Quake/qpe.cpp | 2 +- test/AST-Quake/reset_after_measure.cpp | 10 +- test/AST-Quake/separate_compilation.cpp | 14 +- test/AST-Quake/simple.cpp | 2 +- test/AST-Quake/simple_qarray.cpp | 2 +- test/AST-Quake/slice.cpp | 20 +- test/AST-Quake/ternary.cpp | 75 +-- test/AST-Quake/to_integer.cpp | 12 +- test/AST-Quake/to_qir.cpp | 57 +-- test/AST-Quake/tuple-0.cpp | 10 +- test/AST-Quake/vector-0.cpp | 6 +- test/AST-Quake/vector_bool.cpp | 50 +- test/AST-Quake/veq_size_init_state.cpp | 2 +- test/AST-error/run_struct_of_vec.cpp | 4 +- test/Transforms/add_measurements-0.qke | 2 +- test/Transforms/add_measurements-1.qke | 4 +- test/Transforms/combine_measurements.qke | 24 +- .../convert_to_qir_measurements.qke | 180 ------- test/Transforms/cse.qke | 24 +- .../expand_and_qir_measurements.qke | 89 ---- test/Transforms/expand_measurements.qke | 451 ------------------ test/Transforms/invalid.qke | 33 -- test/Transforms/kernel_exec-1.qke | 3 +- test/Transforms/loop.qke | 2 +- test/Transforms/mapping_non_unitaries.qke | 4 +- test/Transforms/measurements_size.qke | 57 --- test/Transforms/memtoreg-2.qke | 40 +- test/Transforms/memtoreg-3.qke | 6 +- test/Transforms/memtoreg-7.qke | 76 +-- test/Transforms/mz.qke | 8 +- test/Transforms/propagate_metadata_apply.qke | 4 +- test/Transforms/quake-errors.qke | 8 - test/Transforms/resource_count_preprocess.qke | 4 +- test/Transforms/return_vector.qke | 17 +- test/Transforms/roundtrip-ops.qke | 39 +- test/Translate/OpenQASM/basic.qke | 2 +- test/Translate/argument.qke | 9 +- test/Translate/return_values.qke | 89 ++-- unittests/common/MeasureCountsTester.cpp | 39 -- unittests/qir/NVQIRTester.cpp | 13 - 120 files changed, 892 insertions(+), 3413 deletions(-) delete 
mode 100644 runtime/cudaq/qis/measure_result.h rename test/AST-Quake/{qir_profiles.cpp => base_profile-1.cpp} (97%) delete mode 100644 test/AST-Quake/measure_result_assign.cpp delete mode 100644 test/AST-Quake/measure_result_compare.cpp delete mode 100644 test/AST-Quake/measure_result_device_entry.cpp delete mode 100644 test/Transforms/convert_to_qir_measurements.qke delete mode 100644 test/Transforms/expand_and_qir_measurements.qke delete mode 100644 test/Transforms/expand_measurements.qke delete mode 100644 test/Transforms/measurements_size.qke diff --git a/docs/sphinx/examples/cpp/basics/mid_circuit_measurement.cpp b/docs/sphinx/examples/cpp/basics/mid_circuit_measurement.cpp index 36c548b6b5a..ce987a92c39 100644 --- a/docs/sphinx/examples/cpp/basics/mid_circuit_measurement.cpp +++ b/docs/sphinx/examples/cpp/basics/mid_circuit_measurement.cpp @@ -6,7 +6,7 @@ #include struct kernel { - bool operator()() __qpu__ { + auto operator()() __qpu__ { cudaq::qarray<3> q; // Initial state preparation x(q[0]); diff --git a/docs/sphinx/examples/cpp/measuring_kernels.cpp b/docs/sphinx/examples/cpp/measuring_kernels.cpp index 53ee6361326..0c4664c042b 100644 --- a/docs/sphinx/examples/cpp/measuring_kernels.cpp +++ b/docs/sphinx/examples/cpp/measuring_kernels.cpp @@ -27,7 +27,7 @@ __qpu__ void kernel1() { // [End Sample2] // [Begin Run0] -__qpu__ std::vector kernel2() { +__qpu__ auto kernel2() { cudaq::qvector q(2); h(q[0]); auto b0 = mz(q[0]); @@ -37,7 +37,7 @@ __qpu__ std::vector kernel2() { if (b0) { h(q[1]); } - return cudaq::to_bool_vector(mz(q)); + return mz(q); } int main() { diff --git a/docs/sphinx/examples/cpp/sample_to_run_migration.cpp b/docs/sphinx/examples/cpp/sample_to_run_migration.cpp index fe557aa25c9..7ce52822b91 100644 --- a/docs/sphinx/examples/cpp/sample_to_run_migration.cpp +++ b/docs/sphinx/examples/cpp/sample_to_run_migration.cpp @@ -31,7 +31,7 @@ __qpu__ void reset_pattern() { // [Begin Example1] struct simple_conditional { - bool operator()() __qpu__ { 
+ auto operator()() __qpu__ { cudaq::qvector q(2); h(q[0]); auto r = mz(q[0]); diff --git a/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h b/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h index b471ceebb23..56d6006b407 100644 --- a/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h +++ b/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h @@ -81,10 +81,6 @@ static constexpr const char QIRArrayConcatArray[] = "__quantum__rt__array_concatenate"; static constexpr const char QIRArrayCreateArray[] = "__quantum__rt__array_create_1d"; -static constexpr const char QIRResultArrayCreate[] = - "__quantum__rt__result_array_create_1d"; -static constexpr const char QIRResultArrayGetElementPtr1d[] = - "__quantum__rt__result_array_get_element_ptr_1d"; /// Dynamic qubit management helper functions. These are currently only used by /// the NVQIR simulator. diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td index 4edc20ca348..a1f1ec77d5c 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td @@ -261,21 +261,21 @@ def quake_ExtractRefOp : QuakeOp<"extract_ref", [Pure]> { } def quake_RelaxSizeOp : QuakeOp<"relax_size", [Pure]> { - let summary = "Relax the constant size on a sized type to be unknown."; + let summary = "Relax the constant size on a !veq to be unknown."; let description = [{ - Demotes a sized `!quake.veq` to `!quake.veq`, or a sized - `!quake.measurements` to `!quake.measurements`. Required to preserve - strongly-typed IR at function call/return boundaries. + At times, the IR needs to forget the length of an SSA-value of type + `!quake.veq` and demote it to type `!quake.veq` where the size is + said to be unknown. This demotion is required to preserve a valid, + strongly-typed IR. 
- Examples: + Example: ```mlir %uqv = quake.relax_size %qv : (!quake.veq<4>) -> !quake.veq - %ums = quake.relax_size %ms : (!quake.measurements<4>) -> !quake.measurements ``` }]; - let arguments = (ins AnyTypeOf<[VeqType, MeasurementsType]>:$inputVec); - let results = (outs AnyTypeOf<[VeqType, MeasurementsType]>); + let arguments = (ins VeqType:$inputVec); + let results = (outs VeqType); let assemblyFormat = [{ $inputVec `:` functional-type(operands, results) attr-dict @@ -377,34 +377,6 @@ def quake_VeqSizeOp : QuakeOp<"veq_size", [Pure]> { let hasCanonicalizer = 1; } -def quake_MeasurementsSizeOp : QuakeOp<"measurements_size", [Pure]> { - let summary = "Return the size of a measurements collection."; - let description = [{ - Returns the number of individual measurements in a `!quake.measurements` - collection. If the collection has a static size, the static size is returned - (effectively as a constant). If the size is dynamic, the value will be an - SSA-value. - - Examples: - ```mlir - %ms = quake.mz %qubits : (!quake.veq<4>) -> !quake.measurements<4> - %n = quake.measurements_size %ms : (!quake.measurements<4>) -> i64 - - %ms2 = quake.mz %dyn_veq : (!quake.veq) -> !quake.measurements - %n2 = quake.measurements_size %ms2 : (!quake.measurements) -> i64 - ``` - }]; - - let arguments = (ins MeasurementsType:$measurements); - let results = (outs AnySignlessIntegerOrIndex:$size); - - let assemblyFormat = [{ - $measurements `:` functional-type(operands, results) attr-dict - }]; - - let hasCanonicalizer = 1; -} - //===----------------------------------------------------------------------===// // Application, ComputeAction(Uncompute) //===----------------------------------------------------------------------===// @@ -1094,7 +1066,7 @@ class Measurement : QuakeOp:$registerName ); let results = (outs - AnyTypeOf<[MeasureType, MeasurementsType]>:$measOut, + AnyTypeOf<[MeasureType, StdvecOf<[MeasureType]>]>:$measOut, Variadic:$wires ); @@ -1111,15 +1083,13 @@ class 
Measurement : QuakeOp { let summary = "Measurement along the x-axis"; let description = [{ - The `mx` operation measures the state of qubits along the x-axis. For a - single qubit the result is `!quake.measure`; for multiple qubits the result - is `!quake.measurements` (or `!quake.measurements` when unsized). + The `mx` operation measures the state of qubits into classical bits + represented by a `i1` (or a vector of `i1`), along the x-axis. The state of the qubits is collapsed into one of the computational basis states, i.e., either |0> or |1>. A `reset` operation can guarantee that the @@ -1132,9 +1102,8 @@ def MxOp : Measurement<"mx"> { def MyOp : Measurement<"my"> { let summary = "Measurement along the y-axis"; let description = [{ - The `my` operation measures the state of qubits along the y-axis. For a - single qubit the result is `!quake.measure`; for multiple qubits the result - is `!quake.measurements` (or `!quake.measurements` when unsized). + The `my` operation measures the state of qubits into classical bits + represented by a `i1` (or a vector of `i1`), along the y-axis. The state of the qubit is collapsed into one of the computational basis states, i.e., either |0> or |1>. A `reset` operation can guarantee that the @@ -1147,10 +1116,9 @@ def MyOp : Measurement<"my"> { def MzOp : Measurement<"mz"> { let summary = "Measurement along the z-axis"; let description = [{ - The `mz` operation measures the state of qubits along the z-axis---the - so-called computational basis. For a single qubit the result is - `!quake.measure`; for multiple qubits the result is - `!quake.measurements` (or `!quake.measurements` when unsized). + The `mz` operation measures the state of qubits into a classical bits + represented by a `i1` (or a vector of `i1`), along the z-axis---the + so-called computational basis. The state of the qubit is collapsed into one of the computational basis states, i.e., either |0> or |1>. 
A `reset` operation can guarantee that the @@ -1175,7 +1143,7 @@ def quake_DiscriminateOp : QuakeOp<"discriminate", [Pure]> { }]; let arguments = (ins - AnyTypeOf<[MeasureType, MeasurementsType]>:$measurement + AnyTypeOf<[MeasureType, StdvecOf<[MeasureType]>]>:$measurement ); let results = (outs AnyTypeOf<[AnySignlessInteger, StdvecOf<[AnySignlessInteger]>]> @@ -1188,64 +1156,6 @@ def quake_DiscriminateOp : QuakeOp<"discriminate", [Pure]> { let hasVerifier = 1; } -//===----------------------------------------------------------------------===// -// GetMeasureOp -//===----------------------------------------------------------------------===// - -def quake_GetMeasureOp : QuakeOp<"get_measure", [Pure]> { - let summary = - "Extract a single measurement from a measurements collection."; - let description = [{ - Extracts a single `!quake.measure` value from a `!quake.measurements` - collection by index. This is analogous to `quake.extract_ref` for qubits. - - Example: - ```mlir - %m = quake.get_measure %ms[0] : (!quake.measurements<4>) -> !quake.measure - ``` - }]; - - let arguments = (ins - MeasurementsType:$measurements, - Optional:$index, - I64Attr:$rawIndex - ); - let results = (outs MeasureType:$measure); - - let builders = [ - OpBuilder<(ins "mlir::Value":$measurements, "mlir::Value":$index, - "mlir::IntegerAttr":$rawIndex), [{ - return build($_builder, $_state, $_builder.getType(), - measurements, index, rawIndex); - }]>, - OpBuilder<(ins "mlir::Value":$measurements, "mlir::Value":$index), [{ - return build($_builder, $_state, $_builder.getType(), - measurements, index, GetMeasureOp::kDynamicIndex); - }]>, - OpBuilder<(ins "mlir::Value":$measurements, "std::size_t":$rawIndex), [{ - auto i64Ty = $_builder.getI64Type(); - return build($_builder, $_state, $_builder.getType(), - measurements, mlir::Value{}, - mlir::IntegerAttr::get(i64Ty, rawIndex)); - }]> - ]; - - let assemblyFormat = [{ - $measurements `[` custom($index, $rawIndex) `]` `:` - functional-type(operands, 
results) attr-dict - }]; - - let hasVerifier = 1; - - let extraClassDeclaration = [{ - static constexpr std::size_t kDynamicIndex = - std::numeric_limits::max(); - - bool hasConstantIndex() { return !getIndex(); } - std::size_t getConstantIndex() { return getRawIndex(); } - }]; -} - //===----------------------------------------------------------------------===// // Quantum gates //===----------------------------------------------------------------------===// diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.h b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.h index 3bc91c21479..6c0d3ff51ed 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.h +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.h @@ -31,8 +31,7 @@ inline bool isQuantumType(mlir::Type ty) { /// \returns true if \p `ty` is a Quake type. inline bool isQuakeType(mlir::Type ty) { // This should correspond to the registered types in QuakeTypes.cpp. - return isQuantumType(ty) || - mlir::isa(ty); + return isQuantumType(ty) || mlir::isa(ty); } /// \returns true if \p ty is a quantum reference type, excluding `struq`. diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td index 542f4861069..2f4f98cf0fd 100644 --- a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td +++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td @@ -251,41 +251,6 @@ def MeasureType : QuakeType<"Measure", "measure"> { let genStorageClass = 0; } -//===----------------------------------------------------------------------===// -// MeasurementsType: classical data type for a collection of measurements -//===----------------------------------------------------------------------===// - -def MeasurementsType : QuakeType<"Measurements", "measurements"> { - let summary = "a sequence of measurement results"; - let description = [{ - A value of type `measurements` is a collection of values of type `measure`. 
- This is the natural result type of measuring multiple qubits. Like `veq` is - to `ref`, `measurements` is to `measure`. - - ```mlir - %ms = quake.mz %qubits : (!quake.veq<4>) -> !quake.measurements<4> - %m0 = quake.get_measure %ms[0] : (!quake.measurements<4>) -> !quake.measure - ``` - }]; - - let parameters = (ins "std::size_t":$size); - - let hasCustomAssemblyFormat = 1; - - let extraClassDeclaration = [{ - static constexpr std::size_t kDynamicSize = - std::numeric_limits::max(); - - bool hasSpecifiedSize() const { return getSize() != kDynamicSize; } - bool hasNonZeroSpecifiedSize() const { - return hasSpecifiedSize() && getSize(); - } - static MeasurementsType getUnsized(mlir::MLIRContext *ctx) { - return MeasurementsType::get(ctx, kDynamicSize); - } - }]; -} - //===----------------------------------------------------------------------===// // StateType //===----------------------------------------------------------------------===// diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index 98a3a0ec3ff..df9890e47b3 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -219,7 +219,7 @@ def CombineMeasurements : %1 = ... : !quake.veq<4> %2 = quake.subveq %1, %c2, %c3 : (!quake.veq<4>, i32, i32) -> !quake.veq<2> - %measOut = quake.mz %2 : (!quake.veq<2>) -> !quake.measurements<2> + %measOut = quake.mz %2 : (!quake.veq<2>) -> !cc.stdvec } ``` with: @@ -227,7 +227,7 @@ def CombineMeasurements : func.func @kernel() attributes {"cudaq-entrypoint", ["output_names", "[[[0,[1,\22q0\22]],[1,[2,\22q1\22]]]]"]} { %1 = ... 
: !quake.veq<4> - %measOut = quake.mz %1 : (!quake.veq<4>) -> !quake.measurements<4> + %measOut = quake.mz %1 : (!quake.veq<4>) -> !cc.stdvec } ``` }]; @@ -478,30 +478,15 @@ def ExpandControlVeqs: Pass<"expand-control-veqs", "mlir::func::FuncOp"> { } def ExpandMeasurements : Pass<"expand-measurements"> { - let summary = "Expand multi-qubit measurements to individual qubit ops."; + let summary = "Expand multi-ref measurements to series on single refs."; let description = [{ - The `mx`, `my`, `mz` ops can take a list of qubits and/or veq arguments - and return a `!quake.measurements` collection. The target may only - support measuring a single qubit however. This pass expands these - multi-qubit measurements into individual single-qubit measurements in two - steps. - - Step 1: Any `quake.discriminate` on a sized `!quake.measurements` - value is expanded into N individual `quake.get_measure` + - `quake.discriminate` operations on single `!quake.measure` values, with - results collected into a `!cc.stdvec`. - - Step 2: Multi-qubit `mx`, `my`, `mz` ops are replaced with individual - per-qubit measurements. For `veq` targets with known size, the qubits - are extracted via `quake.extract_ref` and measured individually. For - `veq` targets with dynamically-sized targets, a loop is generated - using `quake.veq_size` to compute the iteration count and individual - qubits are extracted via `quake.extract_ref` within the loop body. - - Multi-qubit measurements without local discriminate users are left intact. + The `mx`, `my`, `mz` ops can take a list of qubits and/or veq arguments. + The target may only support measuring a single qubit however. This pass + expands these ops in list format into a series of measurements (including + loops) on individual qubits and into a single `std::vector` result. - The `reset` op can also take a veq argument and this pass will expand that - to a loop of `reset` operations on individual qubits. 
+ The `reset` op can also take a veq argument and this pass will also expand + that to a series of `reset` operations on single qubits. }]; let dependentDialects = ["cudaq::cc::CCDialect", "mlir::LLVM::LLVMDialect"]; diff --git a/lib/Frontend/nvqpp/ASTBridge.cpp b/lib/Frontend/nvqpp/ASTBridge.cpp index c3ac466c2e9..724b13e16e7 100644 --- a/lib/Frontend/nvqpp/ASTBridge.cpp +++ b/lib/Frontend/nvqpp/ASTBridge.cpp @@ -62,8 +62,8 @@ static bool isQubitType(Type ty) { } // Check the builtin type FunctionType to see if it has any references to Quake -// types (including measurement) in its arguments and/or results. -static bool hasAnyQuakeTypes(FunctionType funcTy) { +// qubit types in its arguments and/or results. +static bool hasAnyQubitTypes(FunctionType funcTy) { for (auto ty : funcTy.getInputs()) if (isQubitType(ty)) return true; @@ -639,7 +639,7 @@ void ASTBridgeAction::ASTBridgeConsumer::HandleTranslationUnit( auto unitAttr = UnitAttr::get(ctx); // Flag func as a quantum kernel. func->setAttr(kernelAttrName, unitAttr); - if ((!hasAnyQuakeTypes(func.getFunctionType())) && + if ((!hasAnyQubitTypes(func.getFunctionType())) && (!cudaq::ASTBridgeAction::ASTBridgeConsumer::isCustomOpGenerator( fdPair.second))) { // Flag func as an entry point to a quantum kernel. 
diff --git a/lib/Frontend/nvqpp/ConvertDecl.cpp b/lib/Frontend/nvqpp/ConvertDecl.cpp index 26ab9af5a25..fd01e4b5ec4 100644 --- a/lib/Frontend/nvqpp/ConvertDecl.cpp +++ b/lib/Frontend/nvqpp/ConvertDecl.cpp @@ -93,9 +93,8 @@ void QuakeBridgeVisitor::addArgumentSymbols( auto parmTy = entryBlock->getArgument(index).getType(); if (isa(parmTy)) { + quake::ControlType, quake::RefType, quake::StruqType, + quake::VeqType, quake::WireType>(parmTy)) { symbolTable.insert(name, entryBlock->getArgument(index)); } else { auto stackSlot = builder.create(loc, parmTy); @@ -170,9 +169,6 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) { auto fnTy = cast(popType()); return pushType(cc::IndirectCallableType::get(fnTy)); } - // Measurement result type. - if (name == "measure_result") - return pushType(quake::MeasureType::get(ctx)); if (!isInNamespace(x, "solvers") && !isInNamespace(x, "qec")) { auto loc = toLocation(x); TODO_loc(loc, "unhandled type, " + name + ", in cudaq namespace"); @@ -192,10 +188,6 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) { "std::vector element type is not supported"); return false; } - // TODO: std::vector will be replaced by - // cudaq::measure_vector, recognized directly by class name (see spec). - if (isa(ty)) - return pushType(quake::MeasurementsType::getUnsized(ctx)); return pushType(cc::StdvecType::get(ctx, ty)); } // std::vector => cc.stdvec @@ -740,14 +732,7 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { return true; } - if (isa(type)) { - assert(x->getInit() && "`measure_result` has no default constructor"); - auto initVal = popValue(); - symbolTable.insert(x->getName(), initVal); - if (auto meas = initVal.getDefiningOp()) - meas.setRegisterName(builder.getStringAttr(x->getName())); - return true; - } + // Here we maybe have something like auto var = mz(qreg) if (auto vecType = dyn_cast(type)) { // Variable is of !cc.stdvec type. 
if (x->getInit()) { @@ -759,11 +744,6 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { // and if so, find the mz and tag it with the variable name auto elementType = vecType.getElementType(); - if (auto meas = initVec.getDefiningOp()) { - meas.setRegisterName(builder.getStringAttr(x->getName())); - return true; - } - // Drop out if this is not an i1 if (!elementType.isIntOrFloat() || elementType.getIntOrFloatBitWidth() != 1) @@ -801,11 +781,6 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { auto firstGepUser = *gepOp->getResult(0).getUsers().begin(); if (auto storeOp = dyn_cast(firstGepUser)) { auto result = storeOp->getOperand(0); - if (auto measureOp = - result.getDefiningOp()) { - measureOp.setRegisterName(builder.getStringAttr(x->getName())); - break; - } if (auto discr = result.getDefiningOp()) if (auto mzOp = discr.getMeasurement().getDefiningOp()) { @@ -842,8 +817,9 @@ bool QuakeBridgeVisitor::VisitVarDecl(clang::VarDecl *x) { // If this was an auto var = mz(q), then we want to know the // var name, as it will serve as the classical bit register name - if (auto meas = initValue.getDefiningOp()) - meas.setRegisterName(builder.getStringAttr(x->getName())); + if (auto discr = initValue.getDefiningOp()) + if (auto mz = discr.getMeasurement().getDefiningOp()) + mz.setRegisterName(builder.getStringAttr(x->getName())); assert(initValue && "initializer value must be lowered"); if (isa(initValue.getType()) && isa(type)) { diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp index a2ee180f508..70aaf25f990 100644 --- a/lib/Frontend/nvqpp/ConvertExpr.cpp +++ b/lib/Frontend/nvqpp/ConvertExpr.cpp @@ -555,13 +555,6 @@ SmallVector QuakeBridgeVisitor::convertKernelArgs( continue; } } - if (auto vMeasTy = dyn_cast(vTy)) - if (auto kMeasTy = dyn_cast(kTy)) - if (vMeasTy.hasSpecifiedSize() && !kMeasTy.hasSpecifiedSize()) { - auto relax = builder.create(loc, kMeasTy, v); - result.push_back(relax); - continue; - } 
LLVM_DEBUG(llvm::dbgs() << "convert: " << v << "\nto:" << kTy << '\n'); TODO_loc(loc, "argument type conversion"); @@ -665,7 +658,7 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { } case clang::CastKind::CK_UserDefinedConversion: { auto sub = popValue(); - // castToTy is the conversion function signature. + // castToTy is the converion function signature. castToTy = popType(); if (isa(castToTy) && isa(sub.getType())) { auto locSub = toLocation(x->getSubExpr()); @@ -673,29 +666,6 @@ bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) { assert(result && "integer conversion failed"); return result; } - auto i1Type = builder.getI1Type(); - // Handle conversion of `measure_result` - auto measTy = quake::MeasureType::get(builder.getContext()); - if (sub.getType() == measTy) { - auto i1Val = builder.create(loc, i1Type, sub); - // Convert to `int` - if (isa(castToTy)) - return pushValue(builder.create( - loc, castToTy, i1Val, cudaq::cc::CastOpMode::Unsigned)); - // Convert to `float` - if (isa(castToTy)) - return pushValue(builder.create( - loc, castToTy, i1Val, cudaq::cc::CastOpMode::Unsigned)); - // Otherwise, just return the `i1` value - return pushValue(i1Val); - } - - // Handle conversion of measurement collection to std::vector. - // TODO: will become measure_vector::operator std::vector(). - if (isa(sub.getType())) - return pushValue(builder.create( - loc, cc::StdvecType::get(i1Type), sub)); - TODO_loc(loc, "unhandled user-defined implicit conversion"); } case clang::CastKind::CK_ConstructorConversion: { @@ -1045,7 +1015,7 @@ bool QuakeBridgeVisitor::VisitMaterializeTemporaryExpr( // In those cases, there is nothing to materialize, so we can just pass the // Value on the top of the stack. 
if (isa(ty)) + quake::StateType>(ty)) return true; // If not one of the above special cases, then materialize the value to a @@ -1311,14 +1281,6 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto svec = popValue(); if (isa(svec.getType())) svec = builder.create(loc, svec); - if (isa(svec.getType()) && funcName == "size") - if (auto memberCall = dyn_cast(x)) - if (memberCall->getImplicitObjectArgument()) { - [[maybe_unused]] auto calleeTy = popType(); - assert(isa(calleeTy)); - return pushValue(builder.create( - loc, builder.getI64Type(), svec)); - } auto ext = builder.create(loc, builder.getI64Type(), svec); if (funcName == "size") @@ -1558,38 +1520,10 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { auto funcArity = func->getNumParams(); SmallVector args = lastValues(funcArity); if (isa(func)) { - auto thisPtrValue = popValue(); - - // For `measure_result`, the implicit "this" value is the `!quake.measure` - // SSA value; forward it unchanged - if (isa(func) && - isInClassInNamespace(func, "measure_result", "cudaq")) - return pushValue(thisPtrValue); + [[maybe_unused]] auto thisPtrValue = popValue(); } auto calleeOp = popValue(); - // Handle operator== and operator!= for measure_result (friend functions) - if (func->isOverloadedOperator() && isInNamespace(func, "cudaq")) { - auto opKind = func->getOverloadedOperator(); - if ((opKind == clang::OO_EqualEqual || opKind == clang::OO_ExclaimEqual) && - args.size() == 2) { - auto lhs = args[0]; - auto rhs = args[1]; - auto measTy = quake::MeasureType::get(builder.getContext()); - if (lhs.getType() == measTy || rhs.getType() == measTy) { - auto i1Type = builder.getI1Type(); - if (lhs.getType() == measTy) - lhs = builder.create(loc, i1Type, lhs); - if (rhs.getType() == measTy) - rhs = builder.create(loc, i1Type, rhs); - // Choose predicate based on operator - auto pred = (opKind == clang::OO_EqualEqual) ? 
arith::CmpIPredicate::eq - : arith::CmpIPredicate::ne; - return pushValue(builder.create(loc, pred, lhs, rhs)); - } - } - } - if (isInNamespace(func, "cudaq")) { // Check and see if this quantum operation is adjoint bool isAdjoint = false; @@ -1712,33 +1646,25 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { } if (funcName == "mx" || funcName == "my" || funcName == "mz") { - bool useMeasurements = + // Measurements always return a bool or a std::vector. + bool useStdvec = (args.size() > 1) || (args.size() == 1 && isa(args[0].getType())); auto measure = [&]() -> Value { Type measTy = quake::MeasureType::get(builder.getContext()); - if (useMeasurements) { - std::size_t totalSize = 0; - bool allKnown = true; - for (auto a : args) { - if (quake::isConstantQuantumRefType(a.getType())) - totalSize += quake::getAllocationSize(a.getType()); - else - allKnown = false; - } - if (allKnown && totalSize > 0) - measTy = - quake::MeasurementsType::get(builder.getContext(), totalSize); - else - measTy = quake::MeasurementsType::getUnsized(builder.getContext()); - } + if (useStdvec) + measTy = cc::StdvecType::get(measTy); if (funcName == "mx") return builder.create(loc, measTy, args).getMeasOut(); if (funcName == "my") return builder.create(loc, measTy, args).getMeasOut(); return builder.create(loc, measTy, args).getMeasOut(); }(); - return pushValue(measure); + Type resTy = builder.getI1Type(); + if (useStdvec) + resTy = cc::StdvecType::get(resTy); + return pushValue( + builder.create(loc, resTy, measure)); } // Handle the quantum gate set. @@ -2198,39 +2124,16 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) { return true; } - // TODO: will be replaced by measure_vector::operator std::int64_t(). 
if (funcName == "toInteger" || funcName == "to_integer") { - auto arg = args[0]; - auto i1Ty = builder.getI1Type(); - auto boolVecTy = cc::StdvecType::get(i1Ty); - if (isa(arg.getType())) - arg = builder.create(loc, boolVecTy, arg); - else if (arg.getType() != boolVecTy) - reportClangError(x, mangler, - "`to_integer` requires measurements or " - "std::vector argument"); IRBuilder irBuilder(builder.getContext()); if (failed(irBuilder.loadIntrinsic(module, cudaqConvertToInteger))) { reportClangError(x, mangler, "cannot load cudaqConvertToInteger"); return false; } auto i64Ty = builder.getI64Type(); - return pushValue(builder - .create(loc, i64Ty, - cudaqConvertToInteger, - ValueRange{arg}) - .getResult(0)); - } - - // TODO: will be replaced by measure_vector::operator std::vector(). - if (funcName == "to_bool_vector") { - auto arg = args[0]; - assert(isa(arg.getType()) && - "to_bool_vector requires measurements type argument"); - auto i1Ty = builder.getI1Type(); - arg = builder.create( - loc, cc::StdvecType::get(i1Ty), arg); - return pushValue(arg); + return pushValue( + builder.create(loc, i64Ty, cudaqConvertToInteger, args) + .getResult(0)); } if (funcName == "slice_vector") { @@ -2627,10 +2530,6 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr( auto svec = popValue(); if (isa(svec.getType())) svec = builder.create(loc, svec); - if (isa(svec.getType())) { - auto getMeas = builder.create(loc, svec, indexVar); - return replaceTOSValue(getMeas); - } if (!isa(svec.getType())) { TODO_x(loc, x, mangler, "vector dereference"); return false; @@ -3352,20 +3251,6 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) { return pushValue(builder.create(loc, copyObj)); } - // For `measure_result`, the implicit "this" value is the `!quake.measure` - // SSA value; forward it unchanged. - // Note: Copy support is a temporary concession while - // `std::vector` exists (its `operator[]` returns by - // reference, forcing copies). 
Once replaced, it becomes move-only. - if ((ctor->isCopyConstructor() || ctor->isMoveConstructor()) && - isInClassInNamespace(ctor, "measure_result", "cudaq")) { - assert(x->getNumArgs() == 1); - auto src = popValue(); - assert(isa(src.getType()) && - "`measure_result` copy/move source must be `!quake.measure`"); - return pushValue(src); - } - // TODO: remove this when we can handle ctors more generally. if (!ctor->isDefaultConstructor()) { LLVM_DEBUG(llvm::dbgs() << ctorName << " - unhandled ctor:\n"; x->dump()); diff --git a/lib/Frontend/nvqpp/ConvertStmt.cpp b/lib/Frontend/nvqpp/ConvertStmt.cpp index 23ee12901f5..54bd9ca50ec 100644 --- a/lib/Frontend/nvqpp/ConvertStmt.cpp +++ b/lib/Frontend/nvqpp/ConvertStmt.cpp @@ -263,46 +263,6 @@ bool QuakeBridgeVisitor::TraverseCXXForRangeStmt(clang::CXXForRangeStmt *x, auto idxIters = builder.create( loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned); opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder); - } else if (auto measTy = - dyn_cast(buffer.getType())) { - Value iters; - if (measTy.hasSpecifiedSize()) { - iters = - builder.create(loc, measTy.getSize(), i64Ty); - } else if (auto measIface = dyn_cast_or_null( - buffer.getDefiningOp())) { - // Derive the iteration count from the measurement op's qubit targets. - for (auto target : measIface.getTargets()) { - Value count; - if (auto veqTy = dyn_cast(target.getType())) { - if (veqTy.hasSpecifiedSize()) - count = builder.create(loc, veqTy.getSize(), - i64Ty); - else - count = builder.create(loc, i64Ty, target); - } else { - count = builder.create(loc, 1, i64Ty); - } - iters = - iters ? 
builder.create(loc, iters, count).getResult() - : count; - } - } else { - iters = builder.create(loc, i64Ty, buffer); - } - auto bodyBuilder = [&](OpBuilder &builder, Location loc, Region ®ion, - Block &block) { - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&block); - Value index = block.getArgument(0); - Value measure = builder.create(loc, buffer, index); - symbolTable.insert(loopVar->getName(), measure); - if (!TraverseStmt(static_cast(body))) - result = false; - }; - auto idxIters = builder.create( - loc, i64Ty, iters, cudaq::cc::CastOpMode::Unsigned); - opt::factory::createInvariantLoop(builder, loc, idxIters, bodyBuilder); } else { TODO_x(toLocation(x), x, mangler, "ranged for statement"); } @@ -376,22 +336,6 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) { result = builder.create(loc, i1Ty, result); } } - // Relax sized measurements to unsized when the function expects unsized. - if (auto measTy = dyn_cast(result.getType())) { - auto *parentOp = builder.getBlock()->getParentOp(); - auto funcOp = dyn_cast(parentOp); - if (!funcOp) - funcOp = parentOp->getParentOfType(); - if (funcOp) { - auto fnTy = funcOp.getFunctionType(); - if (fnTy.getNumResults() == 1) - if (auto fnResMeasTy = - dyn_cast(fnTy.getResult(0))) - if (measTy != fnResMeasTy) - result = - builder.create(loc, fnResMeasTy, result); - } - } if (auto vecTy = dyn_cast(resTy)) { // Returning vector data that was allocated on the stack is not valid. // Allocate space on the heap and make a copy of the vector instead. 
It diff --git a/lib/Frontend/nvqpp/ConvertType.cpp b/lib/Frontend/nvqpp/ConvertType.cpp index e151331aafd..c21ef8d56a9 100644 --- a/lib/Frontend/nvqpp/ConvertType.cpp +++ b/lib/Frontend/nvqpp/ConvertType.cpp @@ -124,21 +124,13 @@ static bool isFunctionCallable(Type t) { return false; } -static bool isMeasureType(Type t) { - if (isa(t)) - return true; - if (auto vec = dyn_cast(t)) - return isMeasureType(vec.getElementType()); - return false; -} - /// Return true if and only if \p t is a (simple) arithmetic type, an arithmetic /// sequence type (possibly dynamic in length), or a static product type of /// arithmetic types. Note that this means a product type with a dynamic /// sequence of arithmetic types is \em disallowed. static bool isKernelResultType(Type t) { return isArithmeticType(t) || isArithmeticSequenceType(t) || - isStaticArithmeticProductType(t) || isMeasureType(t); + isStaticArithmeticProductType(t); } /// Return true if and only if \p t is a (simple) arithmetic type, an possibly @@ -147,7 +139,7 @@ static bool isKernelResultType(Type t) { static bool isKernelArgumentType(Type t) { return isArithmeticType(t) || isComposedArithmeticType(t) || quake::isQuantumReferenceType(t) || isKernelCallable(t) || - isFunctionCallable(t) || isMeasureType(t) || + isFunctionCallable(t) || // TODO: move from pointers to a builtin string type. cudaq::isCharPointerType(t); } @@ -457,8 +449,7 @@ bool QuakeBridgeVisitor::VisitLValueReferenceType( return pushType(cc::PointerType::get(builder.getContext())); auto eleTy = popType(); if (isa(eleTy)) + quake::VeqType, quake::RefType, quake::StruqType>(eleTy)) return pushType(eleTy); return pushType(cc::PointerType::get(eleTy)); } @@ -471,8 +462,7 @@ bool QuakeBridgeVisitor::VisitRValueReferenceType( // FIXME: LLVMStructType is promoted as a temporary workaround. 
if (isa(eleTy)) + quake::StruqType, LLVM::LLVMStructType>(eleTy)) return pushType(eleTy); return pushType(cc::PointerType::get(eleTy)); } diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index 968035e37c0..c611b15a1f5 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -551,12 +551,10 @@ static constexpr IntrinsicCode intrinsicTable[] = { func.func private @__quantum__rt__qubit_release(!qir_qubit) func.func private @__quantum__rt__array_create_1d(i32, i64) -> !qir_array - func.func private @__quantum__rt__result_array_create_1d(i64) -> !qir_array func.func private @__quantum__rt__array_concatenate(!qir_array, !qir_array) -> !qir_array func.func private @__quantum__rt__array_get_size_1d(!qir_array) -> i64 func.func private @__quantum__rt__array_slice(!qir_array, i32, i64, i64, i64) -> !qir_array func.func private @__quantum__rt__array_get_element_ptr_1d(!qir_array, i64) -> !cc.ptr - func.func private @__quantum__rt__result_array_get_element_ptr_1d(!qir_array, i64) -> !cc.ptr func.func private @__quantum__qis__h__ctl(!qir_array, !qir_qubit) func.func private @__quantum__qis__x__ctl(!qir_array, !qir_qubit) diff --git a/lib/Optimizer/Builder/Marshal.cpp b/lib/Optimizer/Builder/Marshal.cpp index 03633b8d496..7c272eb3f12 100644 --- a/lib/Optimizer/Builder/Marshal.cpp +++ b/lib/Optimizer/Builder/Marshal.cpp @@ -761,10 +761,10 @@ void cudaq::opt::marshal::populateCallbackBuffer( bool cudaq::opt::marshal::hasLegalType(FunctionType funTy) { for (auto ty : funTy.getInputs()) - if (quake::isQuakeType(ty)) + if (quake::isQuantumType(ty)) return false; for (auto ty : funTy.getResults()) - if (quake::isQuakeType(ty)) + if (quake::isQuantumType(ty)) return false; return true; } @@ -790,10 +790,6 @@ std::pair cudaq::opt::marshal::lookupHostEntryPointFunc( // No host entry point needed. 
return {false, func::FuncOp{}}; } - // Device-only kernels (those with quantum types or `measure_result` in their - // signature) have no host-side entry point, so skip them. - if (!funcOp->hasAttr(cudaq::entryPointAttrName)) - return {false, func::FuncOp{}}; if (auto *decl = module.lookupSymbol(mangledEntryPointName)) if (auto func = dyn_cast(decl)) { func.eraseBody(); diff --git a/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp b/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp index 78585b13502..9cb7869cd66 100644 --- a/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp +++ b/lib/Optimizer/CodeGen/ConvertToExecMgr.cpp @@ -49,8 +49,7 @@ struct QuakeTypeConverter : public TypeConverter { return cudaq::cc::StructType::get(ty.getContext(), mems); }); addConversion([](quake::MeasureType ty) { - auto i64Ty = IntegerType::get(ty.getContext(), 64); - return cudaq::cc::StructType::get(ty.getContext(), {i64Ty, i64Ty}); + return IntegerType::get(ty.getContext(), 64); }); } }; diff --git a/lib/Optimizer/CodeGen/ConvertToQIR.cpp b/lib/Optimizer/CodeGen/ConvertToQIR.cpp index a9770100005..686eb82d806 100644 --- a/lib/Optimizer/CodeGen/ConvertToQIR.cpp +++ b/lib/Optimizer/CodeGen/ConvertToQIR.cpp @@ -206,10 +206,8 @@ void cudaq::opt::initializeTypeConversions(LLVMTypeConverter &typeConverter) { return LLVM::LLVMStructType::getLiteral(type.getContext(), mems, /*packed=*/false); }); - typeConverter.addConversion( - [](quake::MeasureType type) { return getResultType(type.getContext()); }); - typeConverter.addConversion([](quake::MeasurementsType type) { - return getArrayType(type.getContext()); + typeConverter.addConversion([](quake::MeasureType type) { + return IntegerType::get(type.getContext(), 1); }); cudaq::opt::populateCCTypeConversions(&typeConverter); } diff --git a/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp b/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp index 8467cacc73c..0d08c1416e3 100644 --- a/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp +++ b/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp @@ -7,7 
+7,6 @@ ******************************************************************************/ #include "CodeGenOps.h" -#include "cudaq/Optimizer/Builder/Factory.h" #include "cudaq/Optimizer/Builder/Intrinsics.h" #include "cudaq/Optimizer/Builder/Runtime.h" #include "cudaq/Optimizer/CodeGen/CodeGenDialect.h" @@ -149,31 +148,7 @@ struct QIRAPITypeConverter : public TypeConverter { [&](quake::CableType ty) { return getArrayType(ty.getContext()); }); addConversion( [&](quake::MeasureType ty) { return getResultType(ty.getContext()); }); - addConversion([&](quake::MeasurementsType ty) { - return getArrayType(ty.getContext()); - }); addConversion([&](quake::StruqType ty) { return convertStruqType(ty); }); - addConversion([&](cudaq::cc::StdvecType ty) { - return cudaq::cc::StdvecType::get(ty.getContext(), - convertType(ty.getElementType())); - }); - addConversion([&](cudaq::cc::ArrayType ty) { - auto newEleTy = convertType(ty.getElementType()); - auto size = ty.getSize(); - if (size) - return cudaq::cc::ArrayType::get(ty.getContext(), newEleTy, size); - return cudaq::cc::ArrayType::get(newEleTy); - }); - addConversion([&](cudaq::cc::StructType ty) -> Type { - if (ty.getOpaque()) - return ty; - SmallVector members; - for (auto memTy : ty.getMembers()) - members.push_back(convertType(memTy)); - return cudaq::cc::StructType::get(ty.getContext(), ty.getName(), members, - /*opaque=*/false, ty.getPacked(), - ty.getBitSize(), ty.getAlignment()); - }); } Type convertFunctionType(FunctionType ty) { @@ -741,44 +716,6 @@ struct DeallocLikeErase : public OpConversionPattern { using DeallocOpErase = DeallocLikeErase; using SinkOpErase = DeallocLikeErase; - -// Lower `quake.get_measure` to `result_array_get_element_ptr_1d`. 
-struct GetMeasureOpRewrite : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(quake::GetMeasureOp getMeas, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = getMeas.getLoc(); - auto i64Ty = rewriter.getI64Type(); - Value index; - if (!adaptor.getIndex()) { - index = - rewriter.create(loc, getMeas.getRawIndex(), 64); - } else { - index = adaptor.getIndex(); - if (isa(index.getType())) { - index = rewriter.create(loc, i64Ty, index); - } else if (isa(index.getType())) { - auto width = cast(index.getType()).getWidth(); - if (width < 64) - index = rewriter.create( - loc, i64Ty, index, cudaq::cc::CastOpMode::Unsigned); - else if (width > 64) - index = rewriter.create(loc, i64Ty, index); - } - } - auto resultTy = - getTypeConverter()->convertType(getMeas.getMeasure().getType()); - auto ptrResultTy = cudaq::cc::PointerType::get(resultTy); - auto call = rewriter.create( - loc, TypeRange{ptrResultTy}, cudaq::opt::QIRResultArrayGetElementPtr1d, - ArrayRef{adaptor.getMeasurements(), index}); - rewriter.replaceOpWithNewOp(getMeas, call.getResult(0)); - return success(); - } -}; - struct DiscriminateOpRewrite : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -788,86 +725,9 @@ struct DiscriminateOpRewrite ConversionPatternRewriter &rewriter) const override { auto loc = disc.getLoc(); Value m = adaptor.getMeasurement(); - - // If the result is a stdvec (indicating a `MeasurementsType` input), loop - // over the result array and read each result. NB: we check the result type - // rather than the operand type because the type converter has already - // remapped the operand from MeasurementsType to Array*. 
- if (isa(disc.getResult().getType())) { - auto i1Ty = rewriter.getI1Type(); - auto i1PtrTy = cudaq::cc::PointerType::get(i1Ty); - auto i64Ty = rewriter.getI64Type(); - auto resultTy = cudaq::cg::getResultType(rewriter.getContext()); - auto ptrResultTy = cudaq::cc::PointerType::get(resultTy); - - auto stdvecResTy = cast( - getTypeConverter()->convertType(disc.getResult().getType())); - auto elemTy = stdvecResTy.getElementType(); - unsigned elemWidth = cast(elemTy).getWidth(); - Type bufElemTy = - elemWidth > 8 ? elemTy : static_cast(rewriter.getI8Type()); - - Value arraySize = - rewriter - .create(loc, i64Ty, cudaq::opt::QIRArrayGetSize, - ValueRange{m}) - .getResult(0); - Value buff = - rewriter.create(loc, bufElemTy, arraySize); - - cudaq::opt::factory::createInvariantLoop( - rewriter, loc, arraySize, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value iv = block.getArgument(0); - Value elemPtr = builder - .create( - loc, ptrResultTy, - cudaq::opt::QIRResultArrayGetElementPtr1d, - ValueRange{m, iv}) - .getResult(0); - Value resultVal = builder.create(loc, elemPtr); - Value bitPtr = - builder.create(loc, i1PtrTy, resultVal); - Value bit = builder.create(loc, bitPtr); - Value addr = builder.create( - loc, cudaq::cc::PointerType::get(bufElemTy), buff, iv); - Value stored = (i1Ty != bufElemTy) - ? 
builder - .create( - loc, bufElemTy, bit, - cudaq::cc::CastOpMode::Unsigned) - .getResult() - : static_cast(bit); - builder.create(loc, stored, addr); - }); - - auto ptrArrElemTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(elemTy)); - auto buffCast = - rewriter.create(loc, ptrArrElemTy, buff); - rewriter.replaceOpWithNewOp(disc, stdvecResTy, - buffCast, arraySize); - return success(); - } - - auto i1Ty = rewriter.getI1Type(); - auto i1PtrTy = cudaq::cc::PointerType::get(i1Ty); - auto origResTy = disc.getResult().getType(); - Value loaded; - if (auto intTy = dyn_cast(origResTy); - intTy && intTy.getWidth() > 1) { - // For wider-than-i1 types: use byte-addressable i8* load, then cc.cast - // to truncate to the target width. - auto i8Ty = rewriter.getI8Type(); - auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty); - auto bytePtr = rewriter.create(loc, i8PtrTy, m); - Value byteVal = rewriter.create(loc, bytePtr); - loaded = rewriter.create(loc, origResTy, byteVal); - } else { - auto ptrCast = rewriter.create(loc, i1PtrTy, m); - loaded = rewriter.create(loc, ptrCast); - } - rewriter.replaceOp(disc, loaded); + auto i1PtrTy = cudaq::cc::PointerType::get(rewriter.getI1Type()); + auto cast = rewriter.create(loc, i1PtrTy, m); + rewriter.replaceOpWithNewOp(disc, cast); return success(); } }; @@ -883,45 +743,26 @@ struct DiscriminateOpToCallRewrite LogicalResult matchAndRewrite(quake::DiscriminateOp disc, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - // This pattern handles single-qubit MeasureType only. - if (isa(disc.getResult().getType())) - return failure(); - auto loc = disc.getLoc(); - auto i1Ty = rewriter.getI1Type(); - Value loaded; if constexpr (M::discriminateToClassical) { - StringRef readFn = M::qirVersion == QirVersion::version_1_0 - ? 
cudaq::opt::qir1_0::ReadResult - : cudaq::opt::qir0_1::ReadResultBody; - auto call = rewriter.create(loc, i1Ty, readFn, - adaptor.getOperands()); - loaded = call.getResult(0); + if constexpr (M::qirVersion == QirVersion::version_1_0) { + rewriter.replaceOpWithNewOp( + disc, rewriter.getI1Type(), cudaq::opt::qir1_0::ReadResult, + adaptor.getOperands()); + } else { + rewriter.replaceOpWithNewOp( + disc, rewriter.getI1Type(), cudaq::opt::qir0_1::ReadResultBody, + adaptor.getOperands()); + } } else { + auto loc = disc.getLoc(); // NB: the double cast here is to avoid folding the pointer casts. auto i64Ty = rewriter.getI64Type(); auto unu = rewriter.create(loc, i64Ty, adaptor.getOperands()); - auto origResTy = disc.getResult().getType(); - if (auto intTy = dyn_cast(origResTy); - intTy && intTy.getWidth() > 1) { - auto i8Ty = rewriter.getI8Type(); - auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty); - auto du = rewriter.create(loc, i8PtrTy, unu); - Value byteVal = rewriter.create(loc, du); - loaded = rewriter.create(loc, origResTy, byteVal); - } else { - auto ptrI1Ty = cudaq::cc::PointerType::get(i1Ty); - auto du = rewriter.create(loc, ptrI1Ty, unu); - loaded = rewriter.create(loc, du); - } + auto ptrI1Ty = cudaq::cc::PointerType::get(rewriter.getI1Type()); + auto du = rewriter.create(loc, ptrI1Ty, unu); + rewriter.replaceOpWithNewOp(disc, du); } - auto origResTy = disc.getResult().getType(); - if constexpr (M::discriminateToClassical) { - if (auto intTy = dyn_cast(origResTy); - intTy && intTy.getWidth() > 1) - loaded = rewriter.create(loc, origResTy, loaded); - } - rewriter.replaceOp(disc, loaded); return success(); } @@ -950,7 +791,7 @@ struct ExtractRefOpRewrite : public OpConversionPattern { loc, extract.getConstantIndex(), 64); } else { index = adaptor.getIndex(); - if (isa(index.getType())) { + if (index.getType().isIntOrFloat()) { if (cast(index.getType()).getWidth() < 64) index = rewriter.create( loc, i64Ty, index, cudaq::cc::CastOpMode::Unsigned); @@ -1005,20 
+846,6 @@ struct VeqSizeOpRewrite : public OpConversionPattern { } }; -struct MeasurementsSizeOpRewrite - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(quake::MeasurementsSizeOp msize, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(msize, TypeRange{msize.getType()}, - cudaq::opt::QIRArrayGetSize, - adaptor.getOperands()); - return success(); - } -}; - struct MakeStruqOpRewrite : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -1473,12 +1300,9 @@ struct MeasurementOpPattern : public OpConversionPattern { SmallVector args{adaptor.getTargets().begin(), adaptor.getTargets().end()}; auto functionName = M::getQIRMeasure(); - bool isMultiQubit = isa(mz.getMeasOut().getType()); + // Are we using the measurement that returns a result? if constexpr (M::mzReturnsResultType) { - if (isMultiQubit) - return rewriteMultiQubitMeasurement(mz, adaptor, rewriter, loc, - regNameAttr); // Yes, the measurement results the result, so we can use a // straightforward codegen pattern. Use either the mz or the // mz_to_register call (with the name as an extra argument) and forward @@ -1542,113 +1366,6 @@ struct MeasurementOpPattern : public OpConversionPattern { } return success(); } - -private: - LogicalResult - rewriteMultiQubitMeasurement(quake::MzOp mz, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter, - Location loc, StringAttr regNameAttr) const { - auto *ctx = rewriter.getContext(); - auto i64Ty = rewriter.getI64Type(); - auto resultTy = M::getResultType(ctx); - auto arrayTy = M::getArrayType(ctx); - auto qubitTy = M::getQubitType(ctx); - auto ptrQubitTy = cudaq::cc::PointerType::get(qubitTy); - auto ptrResultTy = cudaq::cc::PointerType::get(resultTy); - - // Compute total number of qubits across all targets, caching veq sizes. 
- SmallVector veqSizes; - Value totalQubits = rewriter.create(loc, 0, 64); - for (auto [origTarget, convTarget] : - llvm::zip(mz.getTargets(), adaptor.getTargets())) { - if (isa(origTarget.getType())) { - Value one = rewriter.create(loc, 1, 64); - totalQubits = rewriter.create(loc, totalQubits, one); - veqSizes.push_back(Value{}); - } else { - Value sz = - rewriter - .create(loc, i64Ty, cudaq::opt::QIRArrayGetSize, - ValueRange{convTarget}) - .getResult(0); - totalQubits = rewriter.create(loc, totalQubits, sz); - veqSizes.push_back(sz); - } - } - - // Allocate the result array. - Value resultArray = rewriter - .create( - loc, arrayTy, cudaq::opt::QIRResultArrayCreate, - ValueRange{totalQubits}) - .getResult(0); - - auto functionName = M::getQIRMeasure(); - Value cstringGlobal; - if (mz->getAttr(cudaq::opt::MzAssignedNameAttrName)) { - functionName = cudaq::opt::QIRMeasureToRegister; - cstringGlobal = - createGlobalCString(mz, loc, rewriter, regNameAttr.getValue()); - } - - auto getResultSlot = [&](OpBuilder &builder, Location loc, Value array, - Value index) -> Value { - return builder - .create(loc, ptrResultTy, - cudaq::opt::QIRResultArrayGetElementPtr1d, - ValueRange{array, index}) - .getResult(0); - }; - - // Iterate over targets, measure each qubit, store Result* in the array. 
- Value offset = rewriter.create(loc, 0, 64); - Value one = rewriter.create(loc, 1, 64); - unsigned sizeIdx = 0; - for (auto [origTarget, convTarget] : - llvm::zip(mz.getTargets(), adaptor.getTargets())) { - if (isa(origTarget.getType())) { - SmallVector mzArgs{convTarget}; - if (cstringGlobal) - mzArgs.push_back(cstringGlobal); - Value result = - rewriter.create(loc, resultTy, functionName, mzArgs) - .getResult(0); - Value slot = getResultSlot(rewriter, loc, resultArray, offset); - rewriter.create(loc, result, slot); - offset = rewriter.create(loc, offset, one); - ++sizeIdx; - } else { - Value veqSize = veqSizes[sizeIdx++]; - auto savedOffset = offset; - cudaq::opt::factory::createInvariantLoop( - rewriter, loc, veqSize, - [&](OpBuilder &builder, Location loc, Region &, Block &block) { - Value iv = block.getArgument(0); - Value qubitPtr = - builder - .create(loc, ptrQubitTy, - cudaq::opt::QIRArrayGetElementPtr1d, - ValueRange{convTarget, iv}) - .getResult(0); - Value qubit = builder.create(loc, qubitPtr); - SmallVector mzArgs{qubit}; - if (cstringGlobal) - mzArgs.push_back(cstringGlobal); - Value result = - builder - .create(loc, resultTy, functionName, mzArgs) - .getResult(0); - Value idx = builder.create(loc, savedOffset, iv); - Value slot = getResultSlot(builder, loc, resultArray, idx); - builder.create(loc, result, slot); - }); - offset = rewriter.create(loc, offset, veqSize); - } - } - - rewriter.replaceOp(mz, resultArray); - return success(); - } }; template @@ -2154,23 +1871,20 @@ struct InstantiateCallablePattern } }; -template -struct ZeroResultOpPattern : public OpConversionPattern { - using Base = OpConversionPattern; +struct StoreOpPattern : public OpConversionPattern { + using Base = OpConversionPattern; using Base::Base; + using Base::getTypeConverter; LogicalResult - matchAndRewrite(OP op, typename Base::OpAdaptor adaptor, + matchAndRewrite(cudaq::cc::StoreOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - 
rewriter.replaceOpWithNewOp(op, TypeRange{}, adaptor.getOperands(), - op->getAttrs()); + rewriter.replaceOpWithNewOp( + op, TypeRange{}, adaptor.getOperands(), op->getAttrs()); return success(); } }; -using StoreOpPattern = ZeroResultOpPattern; -using LogOutputOpPattern = ZeroResultOpPattern; - template struct CallOpInterfacePattern : public OpConversionPattern { using Base = OpConversionPattern; @@ -2245,24 +1959,22 @@ struct CallableClosurePattern static void commonClassicalHandlingPatterns(RewritePatternSet &patterns, TypeConverter &typeConverter, MLIRContext *ctx) { - patterns - .insert( - typeConverter, ctx); + patterns.insert( + typeConverter, ctx); } static void commonQuakeHandlingPatterns(RewritePatternSet &patterns, TypeConverter &typeConverter, MLIRContext *ctx) { - patterns.insert(typeConverter, ctx); } @@ -2530,8 +2242,7 @@ struct QuakeToQIRAPIPass cudaq::cc::NoInlineCallOp, cudaq::cc::VarargCallOp, cudaq::cc::CallCallableOp, cudaq::cc::CallIndirectCallableOp, cudaq::cc::CastOp, cudaq::cc::FuncToPtrOp, cudaq::cc::StoreOp, - cudaq::cc::LoadOp, cudaq::cc::ComputePtrOp, cudaq::cc::StdvecInitOp, - cudaq::cc::StdvecDataOp, cudaq::cc::LogOutputOp>([&](Operation *op) { + cudaq::cc::LoadOp>([&](Operation *op) { for (auto opnd : op->getOperands()) if (hasQuakeType(opnd.getType())) return false; @@ -2549,16 +2260,6 @@ struct QuakeToQIRAPIPass static bool hasQuakeType(Type ty) { if (auto pty = dyn_cast(ty)) return hasQuakeType(pty.getElementType()); - if (auto aty = dyn_cast(ty)) - return hasQuakeType(aty.getElementType()); - if (auto sty = dyn_cast(ty)) - return hasQuakeType(sty.getElementType()); - if (auto sty = dyn_cast(ty)) { - for (auto memTy : sty.getMembers()) - if (hasQuakeType(memTy)) - return true; - return false; - } if (auto cty = dyn_cast(ty)) return hasQuakeType(cty.getSignature()); if (auto cty = dyn_cast(ty)) diff --git a/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp b/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp index 35f4380c4e0..8dbeac4659c 100644 --- 
a/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp +++ b/lib/Optimizer/CodeGen/QuakeToExecMgr.cpp @@ -450,25 +450,6 @@ class VeqSizeOpRewrite : public OpConversionPattern { } }; -class MeasurementsSizeOpRewrite - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(quake::MeasurementsSizeOp msize, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = msize->getLoc(); - auto i64Ty = rewriter.getI64Type(); - auto ptrI64Ty = cudaq::cc::PointerType::get(i64Ty); - auto sizeptr = rewriter.create( - loc, ptrI64Ty, adaptor.getMeasurements(), - ArrayRef{1}); - rewriter.replaceOpWithNewOp(msize, sizeptr); - return success(); - } -}; - } // namespace void cudaq::opt::populateQuakeToCCPatterns(TypeConverter &converter, @@ -476,9 +457,8 @@ void cudaq::opt::populateQuakeToCCPatterns(TypeConverter &converter, auto *context = patterns.getContext(); patterns.insert, - GenericRewrite, + MzOpRewrite, ResetRewrite, SubveqOpRewrite, + GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, GenericRewrite, diff --git a/lib/Optimizer/CodeGen/QuakeToLLVM.cpp b/lib/Optimizer/CodeGen/QuakeToLLVM.cpp index f6dbd0206c7..32d845d2b6a 100644 --- a/lib/Optimizer/CodeGen/QuakeToLLVM.cpp +++ b/lib/Optimizer/CodeGen/QuakeToLLVM.cpp @@ -1144,7 +1144,11 @@ class MeasureRewrite : public ConvertOpToLLVMPattern { loc, cudaq::opt::getResultType(context), symbolRef, ValueRange{args}); if (regName) callOp->setAttr("registerName", regName); - rewriter.replaceOp(measure, callOp.getResult()); + auto i1Ty = rewriter.getI1Type(); + auto i1PtrTy = LLVM::LLVMPointerType::get(i1Ty); + auto cast = + rewriter.create(loc, i1PtrTy, callOp.getResult()); + rewriter.replaceOpWithNewOp(measure, i1Ty, cast); return success(); } @@ -1174,28 +1178,6 @@ class GetVeqSizeOpRewrite : public OpConversionPattern { } }; -class GetMeasurementsSizeOpRewrite - : public 
OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(quake::MeasurementsSizeOp msize, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto parentModule = msize->getParentOfType(); - auto context = parentModule->getContext(); - auto qFunctionName = cudaq::opt::QIRArrayGetSize; - - auto symbolRef = cudaq::opt::factory::createLLVMFunctionSymbol( - qFunctionName, rewriter.getI64Type(), - {cudaq::opt::getArrayType(context)}, parentModule); - - rewriter.replaceOpWithNewOp(msize, rewriter.getI64Type(), - symbolRef, adaptor.getOperands()); - return success(); - } -}; - //===----------------------------------------------------------------------===// // Other conversion patterns. //===----------------------------------------------------------------------===// @@ -1429,8 +1411,9 @@ void cudaq::opt::populateQuakeToLLVMPatterns(LLVMTypeConverter &typeConverter, unsigned &measureCounter) { auto *context = patterns.getContext(); cudaq::opt::populateQuakeToCCPrepPatterns(patterns); - patterns.insert(context); + patterns + .insert( + context); patterns .insert std::optional { - // Size of `measure_result` {value, unique_id} = 16 bytes - return {16}; - }) .Default({}); if (rawSize) diff --git a/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc b/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc index e6d4bddb291..a2d45bc0d21 100644 --- a/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc +++ b/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc @@ -100,28 +100,6 @@ struct ForwardConstantVeqSizePattern } }; -// %4 = quake.measurements_size %3 : (!quake.measurements<10>) -> i64 -// ───────────────────────────────────────────────────────────────── -// %4 = constant 10 : i64 -struct ForwardConstantMeasurementsSizePattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(quake::MeasurementsSizeOp msSize, - PatternRewriter &rewriter) const 
override { - auto msTy = - dyn_cast(msSize.getMeasurements().getType()); - if (!msTy) - return failure(); - if (!msTy.hasSpecifiedSize()) - return failure(); - auto resTy = msSize.getType(); - rewriter.replaceOpWithNewOp(msSize, msTy.getSize(), - resTy); - return success(); - } -}; - // %2 = constant 10 : i32 // %3 = quake.alloca !quake.veq[%2 : i32] // ───────────────────────────────────────── @@ -737,98 +715,6 @@ struct MergeRotationPattern : public OpRewritePattern { } }; -// %0 = quake.alloca !quake.veq<2> -// %1 = quake.mz %0 : (!quake.veq<2>) -> !quake.measurements -// ──────────────────────────────────────────────────────────── -// %0 = quake.alloca !quake.veq<2> -// %1 = quake.mz %0 : (!quake.veq<2>) -> !quake.measurements<2> -template -struct FuseSizeToMeasurementPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(MeasOp measOp, - PatternRewriter &rewriter) const override { - auto measTy = - dyn_cast(measOp.getMeasOut().getType()); - if (!measTy || measTy.hasSpecifiedSize()) - return failure(); - - std::size_t totalSize = 0; - for (auto target : measOp.getTargets()) { - if (quake::isConstantQuantumRefType(target.getType())) { - totalSize += quake::getAllocationSize(target.getType()); - continue; - } - return failure(); - } - if (totalSize == 0) - return failure(); - - auto newMeasTy = - quake::MeasurementsType::get(rewriter.getContext(), totalSize); - - // If any user expects unsized measurements (return-like ops where the - // function returns unsized, or call ops where the callee parameter is - // unsized), insert a quake.relax_size to bridge the sized-to-unsized gap. 
- auto needsCastForUser = [&](OpOperand &use) -> bool { - auto *user = use.getOwner(); - if (isa(user)) { - if (auto funcOp = user->getParentOfType()) - if (funcOp.getFunctionType().getNumResults() == 1) - if (auto fnResMeasTy = dyn_cast( - funcOp.getFunctionType().getResult(0))) - return !fnResMeasTy.hasSpecifiedSize(); - return false; - } - auto checkCalleeArgType = [&](mlir::FunctionType calleeType, - unsigned argIdx) -> bool { - if (argIdx < calleeType.getNumInputs()) - if (auto paramMeasTy = dyn_cast( - calleeType.getInput(argIdx))) - return !paramMeasTy.hasSpecifiedSize(); - return false; - }; - if (auto callOp = dyn_cast(user)) - return checkCalleeArgType(callOp.getCalleeType(), - use.getOperandNumber()); - if (auto callOp = dyn_cast(user)) { - unsigned opIdx = use.getOperandNumber(); - if (opIdx == 0) - return false; // operand 0 is the callee value itself - auto calleeFnTy = cast( - callOp.getCallee().getType()); - return checkCalleeArgType(calleeFnTy, opIdx - 1); - } - return false; - }; - - SmallVector resultTypes; - resultTypes.push_back(newMeasTy); - for (unsigned i = 1; i < measOp->getNumResults(); ++i) - resultTypes.push_back(measOp->getResult(i).getType()); - - auto oldAttrs = measOp->getAttrs(); - auto newOp = rewriter.replaceOpWithNewOp(measOp, - TypeRange{resultTypes}, - measOp.getTargets(), - measOp.getRegisterNameAttr()); - for (auto &attr : oldAttrs) - if (!newOp->getAttr(attr.getName())) - newOp->setAttr(attr.getName(), attr.getValue()); - - for (auto &use : - llvm::make_early_inc_range(newOp.getMeasOut().getUses())) { - if (needsCastForUser(use)) { - rewriter.setInsertionPoint(use.getOwner()); - auto relax = rewriter.create( - use.getOwner()->getLoc(), measTy, newOp.getMeasOut()); - use.set(relax); - } - } - return success(); - } -}; - // Forward the argument to a relax_size to the users for all users that are // quake operations. All quake ops that take a sized veq argument are // polymorphic on all veq types. 
If the op is not a quake op, then maintain diff --git a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp index e8c8228f6fa..ac459be3e4e 100644 --- a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp +++ b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp @@ -625,30 +625,6 @@ void quake::GetMemberOp::getCanonicalizationPatterns( patterns.add(context); } -//===----------------------------------------------------------------------===// -// GetMeasureOp -//===----------------------------------------------------------------------===// - -LogicalResult quake::GetMeasureOp::verify() { - if (getIndex()) { - if (getRawIndex() != kDynamicIndex) - return emitOpError( - "must not have both a constant index and an index argument."); - } else { - if (getRawIndex() == kDynamicIndex) { - return emitOpError("invalid constant index value"); - } else { - auto msSize = getMeasurements().getType().getSize(); - if (getMeasurements().getType().hasSpecifiedSize() && - getRawIndex() >= msSize) - return emitOpError("invalid index [" + std::to_string(getRawIndex()) + - "] because >= size [" + std::to_string(msSize) + - "]"); - } - } - return success(); -} - //===----------------------------------------------------------------------===// // InitializeStateOp //===----------------------------------------------------------------------===// @@ -702,19 +678,8 @@ LogicalResult quake::MakeStruqOp::verify() { //===----------------------------------------------------------------------===// LogicalResult quake::RelaxSizeOp::verify() { - auto inTy = getInputVec().getType(); - auto resTy = getType(); - if (auto veqTy = dyn_cast(resTy)) { - if (veqTy.hasSpecifiedSize()) - return emitOpError("result veq type must not specify a size"); - if (!isa(inTy)) - return emitOpError("input and result must both be veq types"); - } else if (auto measTy = dyn_cast(resTy)) { - if (measTy.hasSpecifiedSize()) - return emitOpError("result measurements type must not specify a size"); - if (!isa(inTy)) - 
return emitOpError("input and result must both be measurements types"); - } + if (cast(getType()).hasSpecifiedSize()) + emitOpError("return veq type must not specify a size"); return success(); } @@ -767,15 +732,6 @@ void quake::VeqSizeOp::getCanonicalizationPatterns(RewritePatternSet &patterns, context); } -//===----------------------------------------------------------------------===// -// MeasurementsSizeOp -//===----------------------------------------------------------------------===// - -void quake::MeasurementsSizeOp::getCanonicalizationPatterns( - RewritePatternSet &patterns, MLIRContext *context) { - patterns.add(context); -} - //===----------------------------------------------------------------------===// // WrapOp //===----------------------------------------------------------------------===// @@ -868,12 +824,12 @@ LogicalResult verifyMeasurements(MEAS op, TypeRange targetsType, const Type bitsType) { if (failed(verifyWireResultsAreLinear(op))) return failure(); - bool mustBeCollection = + bool mustBeStdvec = targetsType.size() > 1 || (targetsType.size() == 1 && isa(targetsType[0])); - if (mustBeCollection) { - if (!isa(op.getMeasOut().getType())) - return op.emitOpError("must return `!quake.measurements`, when " + if (mustBeStdvec) { + if (!isa(op.getMeasOut().getType())) + return op.emitOpError("must return `!cc.stdvec`, when " "measuring a qreg, a series of qubits, or both"); } else { if (!isa(op.getMeasOut().getType())) @@ -901,34 +857,19 @@ LogicalResult quake::MzOp::verify() { getMeasOut().getType()); } -void quake::MxOp::getCanonicalizationPatterns(RewritePatternSet &patterns, - MLIRContext *context) { - patterns.add>(context); -} - -void quake::MyOp::getCanonicalizationPatterns(RewritePatternSet &patterns, - MLIRContext *context) { - patterns.add>(context); -} - -void quake::MzOp::getCanonicalizationPatterns(RewritePatternSet &patterns, - MLIRContext *context) { - patterns.add>(context); -} - 
//===----------------------------------------------------------------------===// // Discriminate //===----------------------------------------------------------------------===// LogicalResult quake::DiscriminateOp::verify() { - if (isa(getMeasurement().getType())) { + if (isa(getMeasurement().getType())) { auto stdvecTy = dyn_cast(getResult().getType()); if (!stdvecTy || !isa(stdvecTy.getElementType())) return emitOpError("must return a !cc.stdvec type, when " - "discriminating a measurements collection"); + "discriminating a qreg, a series of qubits, or both"); } else { - if (!isa(getMeasurement().getType()) || - !isa(getResult().getType())) + auto measTy = isa(getMeasurement().getType()); + if (!measTy || !isa(getResult().getType())) return emitOpError( "must return integral type when discriminating exactly one qubit"); } diff --git a/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp b/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp index d61381860ee..b127ea60e88 100644 --- a/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp +++ b/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp @@ -49,34 +49,6 @@ Type quake::VeqType::parse(AsmParser &parser) { return get(parser.getContext(), size); } -//===----------------------------------------------------------------------===// -// Measurements' custom parser and pretty printing. 
-// -// measurements `<` (`?` | int) `>` -//===----------------------------------------------------------------------===// - -void quake::MeasurementsType::print(AsmPrinter &os) const { - os << '<'; - if (hasSpecifiedSize()) - os << getSize(); - else - os << '?'; - os << '>'; -} - -Type quake::MeasurementsType::parse(AsmParser &parser) { - if (parser.parseLess()) - return {}; - std::size_t size = kDynamicSize; - if (succeeded(parser.parseOptionalQuestion())) - size = kDynamicSize; - else if (parser.parseInteger(size)) - return {}; - if (parser.parseGreater()) - return {}; - return get(parser.getContext(), size); -} - //===----------------------------------------------------------------------===// Type quake::StruqType::parse(AsmParser &parser) { @@ -185,6 +157,6 @@ std::size_t quake::getAllocationSize(Type ty) { //===----------------------------------------------------------------------===// void quake::QuakeDialect::registerTypes() { - addTypes(); + addTypes(); } diff --git a/lib/Optimizer/Transforms/AddMeasurements.cpp b/lib/Optimizer/Transforms/AddMeasurements.cpp index b3776062286..1b71702ae1a 100644 --- a/lib/Optimizer/Transforms/AddMeasurements.cpp +++ b/lib/Optimizer/Transforms/AddMeasurements.cpp @@ -92,14 +92,10 @@ addMeasurements(func::FuncOp funcOp, SmallVector &allocations, builder.setInsertionPointToEnd(newBlock); auto measTy = quake::MeasureType::get(builder.getContext()); for (auto &[index, alloca] : llvm::enumerate(allocations)) { - if (auto veqTy = dyn_cast(alloca->getResult(0).getType())) { - Type measurementsTy = [&]() { - auto *ctx = builder.getContext(); - if (veqTy.hasSpecifiedSize()) - return quake::MeasurementsType::get(ctx, veqTy.getSize()); - return quake::MeasurementsType::getUnsized(ctx); - }(); - builder.create(loc, measurementsTy, alloca->getResult(0)); + if (isa(alloca->getResult(0).getType())) { + auto stdvecTy = cudaq::cc::StdvecType::get(measTy); + builder.create(loc, stdvecTy, + ValueRange{alloca->getResult(0)}); } else { 
builder.create(loc, measTy, alloca->getResult(0)); } diff --git a/lib/Optimizer/Transforms/CombineMeasurements.cpp b/lib/Optimizer/Transforms/CombineMeasurements.cpp index f3d422e65d7..5065c8aa6b6 100644 --- a/lib/Optimizer/Transforms/CombineMeasurements.cpp +++ b/lib/Optimizer/Transforms/CombineMeasurements.cpp @@ -109,7 +109,7 @@ class ExtendQubitMeasurePattern : public OpRewritePattern { // with: // ``` // %1 = ... : !quake.veq<4> - // %measOut = quake.mz %1 : (!quake.veq<4>) -> !quake.measurements<4> + // %measOut = quake.mz %1 : (!quake.veq<4>) -> !cc.stdvec // ``` // And collect output names information: `"[[[0,[1,"q0"]],[1,[2,"q1"]]]]"` LogicalResult matchAndRewrite(quake::MzOp measure, @@ -132,12 +132,7 @@ class ExtendQubitMeasurePattern : public OpRewritePattern { analysis.resultQubitVals[offset] = std::make_pair(idx, std::to_string(idx)); - Type resultType; - if (quake::isConstantQuantumRefType(veq.getType())) - resultType = quake::MeasurementsType::get( - measure->getContext(), quake::getAllocationSize(veq.getType())); - else - resultType = quake::MeasurementsType::getUnsized(measure->getContext()); + auto resultType = cudaq::cc::StdvecType::get(measure.getType(0)); if (measure == analysis.lastMeasurement) { rewriter.replaceOpWithNewOp(measure, TypeRange{resultType}, ValueRange{veq}, @@ -170,12 +165,12 @@ class ExtendVeqMeasurePattern : public OpRewritePattern { // %1 = ... : !quake.veq<4> // %2 = quake.subveq %1, %c1, %c2 : (!quake.veq<4>, i32, i32) -> // !quake.veq<2> - // %measOut = quake.mz %2 : (!quake.veq<2>) -> !quake.measurements<2> + // %measOut = quake.mz %2 : (!quake.veq<2>) -> !cc.stdvec // ``` // with: // ``` // %1 = ... 
: !quake.veq<4> - // %measOut = quake.mz %1 : (!quake.veq<4>) -> !quake.measurements<4> + // %measOut = quake.mz %1 : (!quake.veq<4>) -> !cc.stdvec // ``` // And collect output names information: `"[[[0,[1,"q0"]],[1,[2,"q1"]]]]"` LogicalResult matchAndRewrite(quake::MzOp measure, @@ -208,21 +203,12 @@ class ExtendVeqMeasurePattern : public OpRewritePattern { analysis.resultQubitVals[offset] = std::make_pair(i, std::to_string(i)); } - if (measure == analysis.lastMeasurement) { - auto veq = subveq.getVeq(); - Type resultType; - if (quake::isConstantQuantumRefType(veq.getType())) - resultType = quake::MeasurementsType::get( - measure->getContext(), quake::getAllocationSize(veq.getType())); - else - resultType = - quake::MeasurementsType::getUnsized(measure->getContext()); - rewriter.replaceOpWithNewOp(measure, TypeRange{resultType}, - ValueRange{veq}, - measure.getRegisterNameAttr()); - } else if (measure.use_empty()) { + if (measure == analysis.lastMeasurement) + rewriter.replaceOpWithNewOp( + measure, measure.getResultTypes(), ValueRange{subveq.getVeq()}, + measure.getRegisterNameAttr()); + else if (measure.use_empty()) rewriter.eraseOp(measure); - } return success(); } diff --git a/lib/Optimizer/Transforms/ExpandMeasurements.cpp b/lib/Optimizer/Transforms/ExpandMeasurements.cpp index e0f4fc299f7..1527608dca0 100644 --- a/lib/Optimizer/Transforms/ExpandMeasurements.cpp +++ b/lib/Optimizer/Transforms/ExpandMeasurements.cpp @@ -11,113 +11,68 @@ #include "cudaq/Optimizer/Dialect/CC/CCOps.h" #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h" #include "cudaq/Optimizer/Transforms/Passes.h" +#include "cudaq/Todo.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" using namespace mlir; -namespace { // Only an individual qubit measurement returns a bool. 
template bool usesIndividualQubit(A x) { return x.getType() == quake::MeasureType::get(x.getContext()); } -// Pattern for expanding a multi-qubit measurement on unsized veq targets -// into a dynamic loop of individual measurements. +// Generalized pattern for expanding a multiple qubit measurement (whether it is +// mx, my, or mz) to a series of individual measurements. template -class ExpandUnsizedMeasurePattern : public OpRewritePattern { +class ExpandRewritePattern : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(A measureOp, PatternRewriter &rewriter) const override { - if (usesIndividualQubit(measureOp.getMeasOut())) - return failure(); - - // Only handle the unsized case here. - bool hasUnsizedTarget = false; - for (auto v : measureOp.getTargets()) - if (auto veqTy = dyn_cast(v.getType())) - if (!veqTy.hasSpecifiedSize()) - hasUnsizedTarget = true; - if (!hasUnsizedTarget) - return failure(); - - // Only expand if every user of the measurement result is a DiscriminateOp. - for (auto *user : measureOp.getMeasOut().getUsers()) - if (!isa(user)) - return failure(); - - // Even without discriminate users we must expand, because downstream QIR - // lowering cannot handle mz on veq. When discriminate users exist we - // additionally allocate a buffer to collect per-qubit results and build the - // stdvec that replaces each discriminate. - bool hasDiscriminateUsers = !measureOp.getMeasOut().use_empty(); auto loc = measureOp.getLoc(); - auto i64Ty = rewriter.getI64Type(); - auto measTy = quake::MeasureType::get(rewriter.getContext()); - // 1. Determine the total number of qubits we need to measure. This // determines the size of the buffer of bools to create to store the results // in. 
- Value buff, totalToRead, buffOff, one; - Type elemTy, bufElemTy; - if (hasDiscriminateUsers) { - auto firstDisc = cast( - *measureOp.getMeasOut().getUsers().begin()); - auto stdvecTy = - cast(firstDisc.getResult().getType()); - elemTy = stdvecTy.getElementType(); - unsigned elemWidth = cast(elemTy).getWidth(); - bufElemTy = - elemWidth > 8 ? elemTy : static_cast(rewriter.getI8Type()); - - unsigned numQubits = 0u; - for (auto v : measureOp.getTargets()) - if (v.getType().template isa()) - ++numQubits; - totalToRead = - rewriter.template create(loc, numQubits, 64); - for (auto v : measureOp.getTargets()) - if (v.getType().template isa()) { - Value vecSz = - rewriter.template create(loc, i64Ty, v); - totalToRead = - rewriter.template create(loc, totalToRead, vecSz); - } + unsigned numQubits = 0u; + for (auto v : measureOp.getTargets()) + if (v.getType().template isa()) + ++numQubits; + Value totalToRead = + rewriter.template create(loc, numQubits, 64); + auto i64Ty = rewriter.getI64Type(); + for (auto v : measureOp.getTargets()) + if (v.getType().template isa()) { + Value vecSz = rewriter.template create(loc, i64Ty, v); + totalToRead = + rewriter.template create(loc, totalToRead, vecSz); + } - // 2. Create the buffer. - buff = rewriter.template create(loc, bufElemTy, - totalToRead); - buffOff = rewriter.template create(loc, 0, 64); - one = rewriter.template create(loc, 1, 64); - } + // 2. Create the buffer. + auto i1Ty = rewriter.getI1Type(); + auto i8Ty = rewriter.getI8Type(); + Value buff = + rewriter.template create(loc, i8Ty, totalToRead); // 3. Measure each individual qubit and insert the result, in order, into // the buffer. For registers/vectors, loop over the entire set of qubits. 
+ Value buffOff = rewriter.template create(loc, 0, 64); + Value one = rewriter.template create(loc, 1, 64); + auto measTy = quake::MeasureType::get(rewriter.getContext()); for (auto v : measureOp.getTargets()) { if (isa(v.getType())) { - auto meas = rewriter.template create(loc, measTy, v); - if (auto registerName = measureOp.getRegisterNameAttr()) - meas.setRegisterName(registerName); - if (hasDiscriminateUsers) { - auto bit = rewriter.template create( - loc, elemTy, meas.getMeasOut()); - Value addr = rewriter.template create( - loc, cudaq::cc::PointerType::get(bufElemTy), buff, buffOff); - Value stored = (elemTy != bufElemTy) - ? rewriter - .template create( - loc, bufElemTy, bit, - cudaq::cc::CastOpMode::Unsigned) - .getResult() - : static_cast(bit); - rewriter.template create(loc, stored, addr); - buffOff = rewriter.template create(loc, buffOff, one); - } + auto meas = rewriter.template create(loc, measTy, v).getMeasOut(); + auto bit = + rewriter.template create(loc, i1Ty, meas); + Value addr = rewriter.template create( + loc, cudaq::cc::PointerType::get(i8Ty), buff, buffOff); + auto bitByte = rewriter.template create( + loc, i8Ty, bit, cudaq::cc::CastOpMode::Unsigned); + rewriter.template create(loc, bitByte, addr); + buffOff = rewriter.template create(loc, buffOff, one); } else { assert(isa(v.getType())); Value vecSz = rewriter.template create(loc, i64Ty, v); @@ -128,138 +83,41 @@ class ExpandUnsizedMeasurePattern : public OpRewritePattern { Value qv = builder.template create(loc, v, iv); auto meas = builder.template create(loc, measTy, qv); + auto bit = builder.template create( + loc, i1Ty, meas.getMeasOut()); if (auto registerName = measureOp.getRegisterNameAttr()) meas.setRegisterName(registerName); - if (hasDiscriminateUsers) { - auto bit = builder.template create( - loc, elemTy, meas.getMeasOut()); - Value offset = - builder.template create(loc, iv, buffOff); - auto addr = builder.template create( - loc, cudaq::cc::PointerType::get(bufElemTy), buff, 
offset); - Value stored = (elemTy != bufElemTy) - ? builder - .template create( - loc, bufElemTy, bit, - cudaq::cc::CastOpMode::Unsigned) - .getResult() - : static_cast(bit); - builder.template create(loc, stored, addr); - } + Value offset = + builder.template create(loc, iv, buffOff); + auto addr = builder.template create( + loc, cudaq::cc::PointerType::get(i8Ty), buff, offset); + auto bitByte = rewriter.template create( + loc, i8Ty, bit, cudaq::cc::CastOpMode::Unsigned); + builder.template create(loc, bitByte, addr); }); - if (hasDiscriminateUsers) - buffOff = - rewriter.template create(loc, buffOff, vecSz); + buffOff = rewriter.template create(loc, buffOff, vecSz); } } // 4. Use the buffer as an initialization expression and create the // std::vec value. - if (hasDiscriminateUsers) { - auto stdvecTy = cudaq::cc::StdvecType::get(rewriter.getContext(), elemTy); - SmallVector discs; - for (auto *out : measureOp.getMeasOut().getUsers()) - if (auto disc = dyn_cast_if_present(out)) - discs.push_back(disc); - for (auto disc : discs) { - auto ptrArrTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(elemTy)); + auto stdvecTy = cudaq::cc::StdvecType::get(rewriter.getContext(), i1Ty); + for (auto *out : measureOp.getMeasOut().getUsers()) + if (auto disc = dyn_cast_if_present(out)) { + auto ptrArrI1Ty = + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(i1Ty)); auto buffCast = - rewriter.template create(loc, ptrArrTy, buff); + rewriter.template create(loc, ptrArrI1Ty, buff); rewriter.template replaceOpWithNewOp( disc, stdvecTy, buffCast, totalToRead); } - } rewriter.eraseOp(measureOp); return success(); } }; -using MxUnsizedRewrite = ExpandUnsizedMeasurePattern; -using MyUnsizedRewrite = ExpandUnsizedMeasurePattern; -using MzUnsizedRewrite = ExpandUnsizedMeasurePattern; - -// Generalized pattern for expanding a multiple qubit measurement (whether it is -// mx, my, or mz) to a series of individual measurements. 
-template -class ExpandRewritePattern : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(A measureOp, - PatternRewriter &rewriter) const override { - if (usesIndividualQubit(measureOp.getMeasOut())) - return failure(); - - // Collect all the `get_measure` ops for this measurement operation. - SmallVector getMeasureOps; - for (auto *user : measureOp.getMeasOut().getUsers()) - if (auto gm = dyn_cast(user)) - getMeasureOps.push_back(gm); - - // Can only replace `get_measure %m[i]` with per-qubit measurements, else - // bail out. - if (getMeasureOps.empty() && !measureOp.getMeasOut().use_empty()) - return failure(); - - // Validate that all `get_measure` ops have constant indices and all the veq - // targets have known sizes. - for (auto gm : getMeasureOps) - if (!gm.hasConstantIndex()) - return failure(); - std::size_t totalMeasures = 0; - for (auto v : measureOp.getTargets()) { - if (isa(v.getType())) { - ++totalMeasures; - } else { - auto veqTy = cast(v.getType()); - if (!veqTy.hasSpecifiedSize()) - return failure(); - totalMeasures += veqTy.getSize(); - } - } - // Bounds check - for (auto gm : getMeasureOps) - if (gm.getConstantIndex() >= totalMeasures) - return failure(); - - auto loc = measureOp.getLoc(); - auto measTy = quake::MeasureType::get(rewriter.getContext()); - - // Create individual per-qubit measurements for each target. 
- SmallVector individualMeasures; - for (auto v : measureOp.getTargets()) { - if (isa(v.getType())) { - auto meas = rewriter.template create(loc, measTy, v); - if (auto registerName = measureOp.getRegisterNameAttr()) - meas.setRegisterName(registerName); - individualMeasures.push_back(meas.getMeasOut()); - } else { - auto veqTy = cast(v.getType()); - for (std::size_t i = 0; i < veqTy.getSize(); ++i) { - Value idx = - rewriter.template create(loc, i, 64); - Value qv = rewriter.template create(loc, v, idx); - auto meas = rewriter.template create(loc, measTy, qv); - if (auto registerName = measureOp.getRegisterNameAttr()) - meas.setRegisterName(registerName); - individualMeasures.push_back(meas.getMeasOut()); - } - } - } - - // Replace each get_measure op with the corresponding individual result. - for (auto gm : getMeasureOps) - rewriter.replaceOp(gm, individualMeasures[gm.getConstantIndex()]); - - if (measureOp.getMeasOut().use_empty()) - rewriter.eraseOp(measureOp); - - return success(); - } -}; - +namespace { using MxRewrite = ExpandRewritePattern; using MyRewrite = ExpandRewritePattern; using MzRewrite = ExpandRewritePattern; @@ -272,10 +130,8 @@ class ResetRewrite : public OpRewritePattern { LogicalResult matchAndRewrite(quake::ResetOp resetOp, PatternRewriter &rewriter) const override { - auto veqArg = resetOp.getTargets(); - if (!isa(veqArg.getType())) - return failure(); auto loc = resetOp.getLoc(); + auto veqArg = resetOp.getTargets(); auto i64Ty = rewriter.getI64Type(); Value vecSz = rewriter.create(loc, i64Ty, veqArg); cudaq::opt::factory::createInvariantLoop( @@ -290,106 +146,29 @@ class ResetRewrite : public OpRewritePattern { } }; -// Pattern for expanding a `quake.discriminate` op on a `quake.measurements` -// with a known size into a series of `quake.discriminate` ops on individual -// `quake.measure` results via `quake.get_measure`. 
-class ExpandDiscriminatePattern - : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(quake::DiscriminateOp discOp, - PatternRewriter &rewriter) const override { - auto measVal = discOp.getMeasurement(); - auto measTy = dyn_cast(measVal.getType()); - if (!measTy) - return failure(); - if (!measTy.hasSpecifiedSize()) - return failure(); - - auto loc = discOp.getLoc(); - auto stdvecResTy = - cast(discOp.getResult().getType()); - auto elemTy = stdvecResTy.getElementType(); - unsigned elemWidth = cast(elemTy).getWidth(); - Type bufElemTy = elemWidth > 8 ? elemTy : rewriter.getI8Type(); - - Value totalToRead = - rewriter.create(loc, measTy.getSize(), 64); - Value buff = - rewriter.create(loc, bufElemTy, totalToRead); - - // TODO: For large N, consider emitting a loop to avoid IR bloat. - std::size_t n = measTy.getSize(); - for (std::size_t i = 0; i < n; ++i) { - Value getMeas = rewriter.create(loc, measVal, i); - Value bit = rewriter.create(loc, elemTy, getMeas); - Value idx = rewriter.create(loc, i, 64); - Value addr = rewriter.create( - loc, cudaq::cc::PointerType::get(bufElemTy), buff, idx); - Value stored = - (elemTy != bufElemTy) - ? rewriter - .create(loc, bufElemTy, bit, - cudaq::cc::CastOpMode::Unsigned) - .getResult() - : bit; - rewriter.create(loc, stored, addr); - } - - auto ptrArrElemTy = - cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(elemTy)); - auto buffCast = rewriter.create(loc, ptrArrElemTy, buff); - rewriter.replaceOpWithNewOp(discOp, stdvecResTy, - buffCast, totalToRead); - return success(); - } -}; - class ExpandMeasurementsPass : public cudaq::opt::ExpandMeasurementsBase { public: void runOnOperation() override { auto *op = getOperation(); auto *ctx = &getContext(); - - // Step 1: Expand discriminate(measurements) into individual - // get_measure + discriminate ops. This must run first so that step 2's - // ExpandRewritePattern can see the resulting get_measure users. 
- { - RewritePatternSet patterns(ctx); - patterns.insert(ctx); - ConversionTarget target(*ctx); - target.addLegalDialect(); - target.addDynamicallyLegalOp( - [](quake::DiscriminateOp d) { - auto measTy = - dyn_cast(d.getMeasurement().getType()); - if (!measTy) - return true; - return !measTy.hasSpecifiedSize(); - }); - if (failed(applyPartialConversion(op, target, std::move(patterns)))) { - op->emitOpError("could not expand discriminate ops"); - signalPassFailure(); - return; - } - } - - // Step 2: Expand multi-qubit m[xyz] and reset ops. - // ExpandRewritePattern handles sized targets (veq) via unrolling. - // ExpandUnsizedMeasurePattern handles unsized targets (veq) via - // dynamic loops using VeqSizeOp + createInvariantLoop. - { - RewritePatternSet patterns(ctx); - patterns.insert(ctx); - patterns.insert( - ctx); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) { - op->emitOpError("could not expand measurements"); - signalPassFailure(); - } + RewritePatternSet patterns(ctx); + patterns.insert(ctx); + ConversionTarget target(*ctx); + target.addLegalDialect(); + target.addDynamicallyLegalOp( + [](quake::MxOp x) { return usesIndividualQubit(x.getMeasOut()); }); + target.addDynamicallyLegalOp( + [](quake::MyOp x) { return usesIndividualQubit(x.getMeasOut()); }); + target.addDynamicallyLegalOp( + [](quake::MzOp x) { return usesIndividualQubit(x.getMeasOut()); }); + target.addDynamicallyLegalOp([](quake::ResetOp r) { + return !isa(r.getTargets().getType()); + }); + if (failed(applyPartialConversion(op, target, std::move(patterns)))) { + op->emitOpError("could not expand measurements"); + signalPassFailure(); } } }; diff --git a/lib/Optimizer/Transforms/Mapping.cpp b/lib/Optimizer/Transforms/Mapping.cpp index f6a09f1bf16..c7b1f33d910 100644 --- a/lib/Optimizer/Transforms/Mapping.cpp +++ b/lib/Optimizer/Transforms/Mapping.cpp @@ -823,8 +823,6 @@ struct MappingFunc : public cudaq::opt::impl::MappingFuncBase { auto measureOp = builder.create( 
finalQubitWire[i].getLoc(), TypeRange{measTy, wireTy}, finalQubitWire[i]); - /// NOTE: Eagerly discriminate here since these are terminal - /// measurements and would need classical readout. builder.create(finalQubitWire[i].getLoc(), resTy, measureOp.getMeasOut()); diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index efb7bb78033..f4de7021e01 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -2955,17 +2955,7 @@ def bodyBuilder(iterVar): measTy = quake.MeasureType.get() resTy = self.getIntegerType(1) else: - total_size = 0 - all_known = True - for q in qubits: - if quake.isConstantQuantumRefType(q.type): - total_size += quake.getAllocationSize(q.type) - else: - all_known = False - if all_known and total_size > 0: - measTy = quake.MeasurementsType.get(total_size) - else: - measTy = quake.MeasurementsType.get() + measTy = cc.StdvecType.get(quake.MeasureType.get()) resTy = cc.StdvecType.get(self.getIntegerType(1)) measureResult = processQuantumOperation( node.func.id.title(), [], diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py index 82b78fd7381..16eb0569636 100644 --- a/python/cudaq/kernel/kernel_builder.py +++ b/python/cudaq/kernel/kernel_builder.py @@ -1113,24 +1113,6 @@ def reset(self, target): 'reset operation broadcasting on qvector not supported yet.' ) - @staticmethod - def _get_measurement_type(targets): - """ - Compute the appropriate measurement type for the given targets. 
- """ - if len(targets) == 1 and quake.RefType.isinstance(targets[0].type): - return quake.MeasureType.get() - total_size = 0 - all_known = True - for t in targets: - if quake.isConstantQuantumRefType(t.type): - total_size += quake.getAllocationSize(t.type) - else: - all_known = False - if all_known and total_size > 0: - return quake.MeasurementsType.get(total_size) - return quake.MeasurementsType.get() - def mz(self, target, regName=None): """ Measure the given qubit or qubits in the Z-basis. The optional @@ -1163,10 +1145,13 @@ def mz(self, target, regName=None): """ with self.ctx, self.insertPoint, self.loc: i1Ty = IntegerType.get_signless(1) - measTy = PyKernel._get_measurement_type([target.mlirValue]) + qubitTy = target.mlirValue.type retTy = i1Ty - if quake.MeasurementsType.isinstance(measTy): - retTy = cc.StdvecType.get(i1Ty) + measTy = quake.MeasureType.get() + stdvecTy = cc.StdvecType.get(i1Ty) + if quake.VeqType.isinstance(target.mlirValue.type): + retTy = stdvecTy + measTy = cc.StdvecType.get(measTy) if regName is not None: res = quake.MzOp(measTy, [], [target.mlirValue], registerName=StringAttr.get(regName, @@ -1207,10 +1192,13 @@ def mx(self, target, regName=None): """ with self.ctx, self.insertPoint, self.loc: i1Ty = IntegerType.get_signless(1) - measTy = PyKernel._get_measurement_type([target.mlirValue]) + qubitTy = target.mlirValue.type retTy = i1Ty - if quake.MeasurementsType.isinstance(measTy): - retTy = cc.StdvecType.get(i1Ty) + measTy = quake.MeasureType.get() + stdvecTy = cc.StdvecType.get(i1Ty) + if quake.VeqType.isinstance(target.mlirValue.type): + retTy = stdvecTy + measTy = cc.StdvecType.get(measTy) if regName is not None: res = quake.MxOp(measTy, [], [target.mlirValue], registerName=StringAttr.get(regName, @@ -1252,10 +1240,13 @@ def my(self, target, regName=None): """ with self.ctx, self.insertPoint, self.loc: i1Ty = IntegerType.get_signless(1) - measTy = PyKernel._get_measurement_type([target.mlirValue]) + qubitTy = target.mlirValue.type 
retTy = i1Ty - if quake.MeasurementsType.isinstance(measTy): - retTy = cc.StdvecType.get(i1Ty) + measTy = quake.MeasureType.get() + stdvecTy = cc.StdvecType.get(i1Ty) + if quake.VeqType.isinstance(target.mlirValue.type): + retTy = stdvecTy + measTy = cc.StdvecType.get(measTy) if regName is not None: res = quake.MyOp(measTy, [], [target.mlirValue], registerName=StringAttr.get(regName, diff --git a/python/runtime/mlir/py_register_dialects.cpp b/python/runtime/mlir/py_register_dialects.cpp index 35f0b8cc217..4db6be7ed36 100644 --- a/python/runtime/mlir/py_register_dialects.cpp +++ b/python/runtime/mlir/py_register_dialects.cpp @@ -67,18 +67,6 @@ void registerQuakeDialectAndTypes(nanobind::module_ &m) { }, nanobind::arg("cls"), nanobind::arg("context") = nanobind::none()); - mlir::python::nanobind_adaptors::mlir_type_subclass( - quakeMod, "MeasurementsType", - [](MlirType type) { return unwrap(type).isa(); }) - .def_classmethod( - "get", - [](nanobind::object cls, std::size_t size, MlirContext context) { - return wrap(quake::MeasurementsType::get(unwrap(context), size)); - }, - nanobind::arg("cls"), - nanobind::arg("size") = quake::MeasurementsType::kDynamicSize, - nanobind::arg("context") = nanobind::none()); - mlir::python::nanobind_adaptors::mlir_type_subclass( quakeMod, "VeqType", [](MlirType type) { return unwrap(type).isa(); }) diff --git a/python/tests/mlir/adjoint.py b/python/tests/mlir/adjoint.py index 66b1f6cfe5a..2321902b7c7 100644 --- a/python/tests/mlir/adjoint.py +++ b/python/tests/mlir/adjoint.py @@ -302,7 +302,7 @@ def test_sample_adjoint_qreg(): # CHECK: } {invariant} # CHECK: call @__nvqpp__mlirgen__PythonKernelBuilderInstance{{.*}}(%[[VAL_3]]) : (!quake.veq) -> () # CHECK: quake.apply @__nvqpp__mlirgen__PythonKernelBuilderInstance{{.*}} %[[VAL_3]] : (!quake.veq) -> () -# CHECK: %[[VAL_13:.*]] = quake.mz %0 : (!quake.veq) -> !quake.measurements +# CHECK: %[[VAL_13:.*]] = quake.mz %0 : (!quake.veq) -> !cc.stdvec # CHECK: return # CHECK: } diff --git 
a/python/tests/mlir/bug_1777.py b/python/tests/mlir/bug_1777.py index b4bfc5f5e06..0ed54ea9403 100644 --- a/python/tests/mlir/bug_1777.py +++ b/python/tests/mlir/bug_1777.py @@ -48,7 +48,7 @@ def test(): # CHECK: %[[VAL_17:.*]] = quake.discriminate %[[VAL_16]] : (!quake.measure) -> i1 # CHECK: %[[VAL_18:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_0]] : i1 # CHECK: cc.if(%[[VAL_18]]) { -# CHECK: %[[VAL_19:.*]] = quake.mz %[[VAL_6]] name "inner_mz" : (!quake.veq<2>) -> !quake.measurements<2> +# CHECK: %[[VAL_19:.*]] = quake.mz %[[VAL_6]] name "inner_mz" : (!quake.veq<2>) -> !cc.stdvec # CHECK: } else { # CHECK: } # CHECK: cc.continue %[[VAL_12]], %[[VAL_12]], %[[VAL_17]] : i64, i64, i1 @@ -59,7 +59,7 @@ def test(): # CHECK: } # CHECK: %[[VAL_24:.*]] = arith.cmpi eq, %[[VAL_25:.*]]#2, %[[VAL_3]] : i1 # CHECK: cc.if(%[[VAL_24]]) { -# CHECK: %[[VAL_26:.*]] = quake.mz %[[VAL_6]] name "outer_mz" : (!quake.veq<2>) -> !quake.measurements<2> +# CHECK: %[[VAL_26:.*]] = quake.mz %[[VAL_6]] name "outer_mz" : (!quake.veq<2>) -> !cc.stdvec # CHECK: } else { # CHECK: } # CHECK: quake.dealloc %[[VAL_6]] : !quake.veq<2> diff --git a/python/tests/mlir/call_qpu.py b/python/tests/mlir/call_qpu.py index 259fd08bdae..b041f369a3e 100644 --- a/python/tests/mlir/call_qpu.py +++ b/python/tests/mlir/call_qpu.py @@ -55,8 +55,8 @@ def main_kernel() -> int: # CHECK-LABEL: func.func @__nvqpp__mlirgen__func_achat.. 
# CHECK-SAME: %[[VAL_0:.*]]: !quake.veq) -> !cc.stdvec attributes {"cudaq-kernel", qubitMeasurementFeedback = true} { # CHECK: %[[VAL_1:.*]] = arith.constant false -# CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_0]] : (!quake.veq) -> !quake.measurements -# CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measurements) -> !cc.stdvec +# CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_0]] : (!quake.veq) -> !cc.stdvec +# CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!cc.stdvec) -> !cc.stdvec # CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec) -> !cc.ptr> # CHECK: %[[VAL_5:.*]] = cc.stdvec_size %[[VAL_3]] : (!cc.stdvec) -> i64 # CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr diff --git a/python/tests/mlir/measure.py b/python/tests/mlir/measure.py index 4092412ec40..878836a4326 100644 --- a/python/tests/mlir/measure.py +++ b/python/tests/mlir/measure.py @@ -72,9 +72,9 @@ def test_kernel_measure_qreg(): # CHECK-LABEL: func.func @__nvqpp__mlirgen__PythonKernelBuilderInstance # CHECK-SAME: () attributes {"cudaq-entrypoint" # CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<3> -# CHECK: %[[VAL_1:.*]] = quake.mx %[[VAL_0]] : (!quake.veq<3>) -> !quake.measurements<3> -# CHECK: %[[VAL_2:.*]] = quake.my %[[VAL_0]] : (!quake.veq<3>) -> !quake.measurements<3> -# CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<3>) -> !quake.measurements<3> +# CHECK: %[[VAL_1:.*]] = quake.mx %[[VAL_0]] : (!quake.veq<3>) -> !cc.stdvec +# CHECK: %[[VAL_2:.*]] = quake.my %[[VAL_0]] : (!quake.veq<3>) -> !cc.stdvec +# CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<3>) -> !cc.stdvec # CHECK: return # CHECK: } diff --git a/python/tests/mlir/swap.py b/python/tests/mlir/swap.py index dd5701e9922..a6b8cbce7e3 100644 --- a/python/tests/mlir/swap.py +++ b/python/tests/mlir/swap.py @@ -40,7 +40,7 @@ def test_swap_2q(): # CHECK: %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][1] : (!quake.veq<2>) -> !quake.ref # CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () 
# CHECK: quake.swap %[[VAL_1]], %[[VAL_2]] : (!quake.ref, !quake.ref) -> () -# CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +# CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec # CHECK: return # CHECK: } diff --git a/runtime/cudaq/builder/QuakeValue.cpp b/runtime/cudaq/builder/QuakeValue.cpp index e6c5137211e..f9c1f25c618 100644 --- a/runtime/cudaq/builder/QuakeValue.cpp +++ b/runtime/cudaq/builder/QuakeValue.cpp @@ -102,7 +102,7 @@ std::size_t QuakeValue::getRequiredElements() { QuakeValue QuakeValue::operator[](const std::size_t idx) { Value vectorValue = value->asMLIR(); Type type = vectorValue.getType(); - if (!isa(type)) { + if (!isa(type)) { std::string typeName; { llvm::raw_string_ostream os(typeName); @@ -113,11 +113,6 @@ QuakeValue QuakeValue::operator[](const std::size_t idx) { typeName + ")."); } - if (isa(type)) { - Value measure = opBuilder.create(vectorValue, idx); - return QuakeValue(opBuilder, measure); - } - Value indexVar = opBuilder.create(idx, 32); if (isa(type)) { @@ -144,7 +139,7 @@ QuakeValue QuakeValue::operator[](const std::size_t idx) { QuakeValue QuakeValue::operator[](const QuakeValue &idx) { Value vectorValue = value->asMLIR(); Type type = vectorValue.getType(); - if (!isa(type)) { + if (!isa(type)) { std::string typeName; { llvm::raw_string_ostream os(typeName); @@ -157,12 +152,6 @@ QuakeValue QuakeValue::operator[](const QuakeValue &idx) { Value indexVar = idx.getValue(); - if (isa(type)) { - Value measure = - opBuilder.create(vectorValue, indexVar); - return QuakeValue(opBuilder, measure); - } - if (isa(type)) { Value extractedQubit = opBuilder.create(vectorValue, indexVar); @@ -186,15 +175,13 @@ QuakeValue QuakeValue::operator[](const QuakeValue &idx) { QuakeValue QuakeValue::size() { Value vectorValue = value->asMLIR(); Type type = vectorValue.getType(); - if (!isa(type)) + if (!isa(type)) throw std::runtime_error("This QuakeValue does not expose .size()."); Type 
i64Ty = opBuilder.getI64Type(); Value ret; if (isa(type)) ret = opBuilder.create(i64Ty, vectorValue); - else if (isa(type)) - ret = opBuilder.create(i64Ty, vectorValue); else ret = opBuilder.create(i64Ty, vectorValue); diff --git a/runtime/cudaq/builder/kernel_builder.cpp b/runtime/cudaq/builder/kernel_builder.cpp index d134918d287..474d3834150 100644 --- a/runtime/cudaq/builder/kernel_builder.cpp +++ b/runtime/cudaq/builder/kernel_builder.cpp @@ -784,14 +784,11 @@ QuakeValue applyMeasure(ImplicitLocOpBuilder &builder, Value value, if (!regName.empty()) strAttr = builder.getStringAttr(regName); + Type resTy = builder.getI1Type(); Type measTy = quake::MeasureType::get(builder.getContext()); if (!isa(type)) { - if (auto veqTy = dyn_cast(type); - veqTy && veqTy.hasSpecifiedSize()) - measTy = - quake::MeasurementsType::get(builder.getContext(), veqTy.getSize()); - else - measTy = quake::MeasurementsType::getUnsized(builder.getContext()); + resTy = cc::StdvecType::get(resTy); + measTy = cc::StdvecType::get(measTy); } Value measureResult; if (strAttr) @@ -802,7 +799,8 @@ QuakeValue applyMeasure(ImplicitLocOpBuilder &builder, Value value, measureResult = builder.template create(measTy, value).getMeasOut(); - return QuakeValue(builder, measureResult); + Value bits = builder.create(resTy, measureResult); + return QuakeValue(builder, bits); } QuakeValue mx(ImplicitLocOpBuilder &builder, QuakeValue &qubitOrQvec, diff --git a/runtime/cudaq/qis/execution_manager.h b/runtime/cudaq/qis/execution_manager.h index f57b3cbbd82..585496aca89 100644 --- a/runtime/cudaq/qis/execution_manager.h +++ b/runtime/cudaq/qis/execution_manager.h @@ -34,6 +34,45 @@ struct QuditInfo { } }; +extern "C" { +bool __nvqpp__MeasureResultBoolConversion(int); +} + +#ifdef CUDAQ_LIBRARY_MODE + +/// In library mode, we model the return type of a qubit measurement result via +/// the measure_result type. 
This allows us to keep track of when the result is +/// implicitly cast to a boolean (likely in the case of conditional feedback), +/// and affect the simulation accordingly. +class measure_result { +private: + /// The intrinsic measurement result + int result = 0; + + /// Unique integer for measure result identification + std::size_t uniqueId = 0; + +public: + measure_result(int res, std::size_t id) : result(res), uniqueId(id) {} + measure_result(int res) : result(res) {} + + operator int() const { return result; } + operator bool() const { return __nvqpp__MeasureResultBoolConversion(result); } + + static std::vector + to_bool_vector(const std::vector &results) { + std::vector boolResults; + boolResults.reserve(results.size()); + for (const auto &res : results) + boolResults.push_back(static_cast(res)); + return boolResults; + } +}; +#else +/// When compiling with MLIR, we default to a boolean. +using measure_result = bool; +#endif + /// The ExecutionManager provides a base class describing a concrete sub-system /// for allocating qudits and executing quantum instructions on those qudits. /// This type is templated on the concrete qudit type (`qubit`, `qmode`, etc). 
diff --git a/runtime/cudaq/qis/execution_manager_c_api.cpp b/runtime/cudaq/qis/execution_manager_c_api.cpp index 28ccf4a7205..f90a420690d 100644 --- a/runtime/cudaq/qis/execution_manager_c_api.cpp +++ b/runtime/cudaq/qis/execution_manager_c_api.cpp @@ -7,7 +7,7 @@ ******************************************************************************/ #include "cudaq/platform.h" -#include "measure_result.h" +#include "execution_manager.h" bool cudaq::__nvqpp__MeasureResultBoolConversion(int result) { auto &platform = get_platform(); diff --git a/runtime/cudaq/qis/measure_result.h b/runtime/cudaq/qis/measure_result.h deleted file mode 100644 index c2032d3cf48..00000000000 --- a/runtime/cudaq/qis/measure_result.h +++ /dev/null @@ -1,78 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#pragma once - -#include -#include - -namespace cudaq { - -extern "C" { -bool __nvqpp__MeasureResultBoolConversion(int); -} - -/// We model the return type of a qubit measurement result via the -/// `measure_result` type. This allows us to keep track of when the result is -/// implicitly cast to a boolean (likely in the case of conditional feedback), -/// and affect the simulation accordingly. -/// -/// TODO: A companion `measure_vector` type will replace -/// `std::vector` for multi-qubit measurements (see spec). -class measure_result { -public: - /// The intrinsic measurement value - std::int64_t value = 0; - - /// Unique integer for measure result identification. 
- /// INT64_MAX means unassigned; negative values are valid - std::int64_t unique_id = std::numeric_limits::max(); - - // No default construction (measurements must come from mz/mx/my). - // No assignment (measurement collections are immutable). - measure_result() = delete; - measure_result(const measure_result &) = default; - measure_result(measure_result &&) = default; - measure_result &operator=(const measure_result &) = delete; - measure_result &operator=(measure_result &&) = delete; - - explicit measure_result(int64_t val) : value(val) {} - explicit measure_result(int64_t val, int64_t id) - : value(val), unique_id(id) {} - - // Operator overloads for conversions and comparisons -#ifdef CUDAQ_LIBRARY_MODE - operator bool() const { return __nvqpp__MeasureResultBoolConversion(value); } -#else - operator bool() const { return value == 1; } -#endif - explicit operator int() const { return static_cast(value); } - explicit operator double() const { return static_cast(value); } - - friend bool operator==(const measure_result &m1, const measure_result &m2) { - return (m1.value == m2.value) && (m1.unique_id == m2.unique_id); - } - friend bool operator==(const measure_result &m, bool b) { - return static_cast(m) == b; - } - friend bool operator==(bool b, const measure_result &m) { - return b == static_cast(m); - } - - friend bool operator!=(const measure_result &m1, const measure_result &m2) { - return (m1.value != m2.value) || (m1.unique_id != m2.unique_id); - } - friend bool operator!=(const measure_result &m, bool b) { - return static_cast(m) != b; - } - friend bool operator!=(bool b, const measure_result &m) { - return b != static_cast(m); - } -}; - -} // namespace cudaq diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h index c578c5c23a6..6e3df5db04f 100644 --- a/runtime/cudaq/qis/qubit_qis.h +++ b/runtime/cudaq/qis/qubit_qis.h @@ -12,7 +12,6 @@ #include "cudaq/host_config.h" #include "cudaq/operators.h" #include "cudaq/platform.h" -#include 
"cudaq/qis/measure_result.h" #include "cudaq/qis/modifiers.h" #include "cudaq/qis/pauli_word.h" #include "cudaq/qis/qarray.h" @@ -421,33 +420,29 @@ void exp_pauli(QuantumRegister &ctrls, double theta, const char *pauliWord, false, spin_op::from_word(pauliWord)); } -/// @brief Measure an individual qubit, return as `measure_result` +/// @brief Measure an individual qubit, return 0,1 as `bool` inline measure_result mz(qubit &q) { - return measure_result( - getExecutionManager()->measure(QuditInfo{q.n_levels(), q.id()})); + return getExecutionManager()->measure(QuditInfo{q.n_levels(), q.id()}); } -/// @brief Measure an individual qubit in `x` basis, return as `measure_result` +/// @brief Measure an individual qubit in `x` basis, return 0,1 as `bool` inline measure_result mx(qubit &q) { h(q); - return measure_result( - getExecutionManager()->measure(QuditInfo{q.n_levels(), q.id()})); + return getExecutionManager()->measure(QuditInfo{q.n_levels(), q.id()}); } -// Measure an individual qubit in `y` basis, return as `measure_result` +// Measure an individual qubit in `y` basis, return 0,1 as `bool` inline measure_result my(qubit &q) { r1(-M_PI_2, q); h(q); - return measure_result( - getExecutionManager()->measure(QuditInfo{q.n_levels(), q.id()})); + return getExecutionManager()->measure(QuditInfo{q.n_levels(), q.id()}); } inline void reset(qubit &q) { getExecutionManager()->reset({q.n_levels(), q.id()}); } -// Measure all qubits in the range. -// TODO: return type will change to cudaq::measure_vector (see spec). 
+// Measure all qubits in the range, return vector of 0,1 template requires std::ranges::range std::vector mz(QubitRange &q) { @@ -478,8 +473,7 @@ std::vector mz(QubitRange &qr, Qs &&...qs) { if constexpr (std::is_same_v) { result.push_back(rest); } else { - for (const auto &r : rest) - result.push_back(r); + result.insert(result.end(), rest.begin(), rest.end()); } return result; } @@ -491,8 +485,7 @@ std::vector mz(qubit &q, Qs &&...qs) { if constexpr (std::is_same_v) { result.push_back(rest); } else { - for (const auto &r : rest) - result.push_back(r); + result.insert(result.end(), rest.begin(), rest.end()); } return result; } @@ -513,7 +506,8 @@ inline SpinMeasureResult measure(const cudaq::spin_op &term) { return getExecutionManager()->measure(term); } -// TODO: will become measure_vector::operator std::int64_t() (see spec). +// Cast a measure register to an int64_t. +// This function is classic control code that may run on a QPU. inline std::int64_t to_integer(const std::vector &bits) { std::int64_t ret = 0; for (std::size_t i = 0; i < bits.size(); i++) { @@ -524,33 +518,12 @@ inline std::int64_t to_integer(const std::vector &bits) { return ret; } -inline std::int64_t to_integer(const std::vector &bits) { - std::int64_t ret = 0; - for (std::size_t i = 0; i < bits.size(); i++) { - if (bits[i]) { - ret |= 1UL << i; - } - } - return ret; -} - inline std::int64_t to_integer(const std::string &arg) { std::string bitString{arg}; std::reverse(bitString.begin(), bitString.end()); return std::stoull(bitString, nullptr, 2); } -// TODO: will be replaced by measure_vector::operator std::vector() (see -// spec). -inline std::vector -to_bool_vector(const std::vector &results) { - std::vector out; - out.reserve(results.size()); - for (const auto &r : results) - out.push_back(static_cast(r)); - return out; -} - // This concept tests if `Kernel` is a `Callable` that takes the arguments, // `Args`, and returns `void`. 
template diff --git a/runtime/nvqir/QIRTypes.cpp b/runtime/nvqir/QIRTypes.cpp index f566c62715d..ec0b5f21f94 100644 --- a/runtime/nvqir/QIRTypes.cpp +++ b/runtime/nvqir/QIRTypes.cpp @@ -153,20 +153,12 @@ Array *__quantum__rt__array_create_1d(int32_t itemSizeInBytes, return array; } -Array *__quantum__rt__result_array_create_1d(std::int64_t count) { - return __quantum__rt__array_create_1d(sizeof(void *), count); -} - int8_t *__quantum__rt__array_get_element_ptr_1d(Array *q, uint64_t idx) { Array &arr = *q; int8_t *ptr = arr[idx]; return ptr; } -int8_t *__quantum__rt__result_array_get_element_ptr_1d(Array *q, uint64_t idx) { - return __quantum__rt__array_get_element_ptr_1d(q, idx); -} - int64_t __quantum__rt__array_get_size_1d(Array *state1) { if (state1 == nullptr) return 0; diff --git a/runtime/nvqir/QIRTypes.h b/runtime/nvqir/QIRTypes.h index 5d9b495f5cf..1673a1cc4c0 100644 --- a/runtime/nvqir/QIRTypes.h +++ b/runtime/nvqir/QIRTypes.h @@ -83,17 +83,15 @@ Array *__quantum__rt__array_slice(Array *array, int32_t dim, Array *__quantum__rt__array_slice_1d(Array *array, int64_t range_start, int64_t range_step, int64_t range_end); Array *quantum__rt__array_slice(Array *array, int32_t dim, Range range); -Array *__quantum__rt__result_array_create_1d(std::int64_t count); -int8_t *__quantum__rt__result_array_get_element_ptr_1d(Array *q, uint64_t idx); // Internal method to clean up any dangling arrays void __nvqpp_cleanup_arrays(); } // Results -using Result = int; -static const Result ResultZeroVal = 0; -static const Result ResultOneVal = 1; +using Result = bool; +static const Result ResultZeroVal = false; +static const Result ResultOneVal = true; inline Result *ResultZero = const_cast(&ResultZeroVal); inline Result *ResultOne = const_cast(&ResultOneVal); diff --git a/targettests/Kernel/inline-qpu-func.cpp b/targettests/Kernel/inline-qpu-func.cpp index 948165af55d..9d0dae31c17 100644 --- a/targettests/Kernel/inline-qpu-func.cpp +++ b/targettests/Kernel/inline-qpu-func.cpp 
@@ -10,8 +10,9 @@ #include "cudaq.h" -// This is device only kernel since entry-point kernels cannot accept -// `measure_result` or `std::vector` as parameters. +// This function has no cudaq::qubit's in the parameter list, so it will be +// tagged as a possible cudaq-entrypoint kernel. Make sure we can still inline +// it if called from another kernel. bool xor_result(const std::vector &result_vec) __qpu__ { bool result = false; for (auto x : result_vec) diff --git a/targettests/Kernel/signature-0.cpp b/targettests/Kernel/signature-0.cpp index 6153ff3cc8d..fe853eee719 100644 --- a/targettests/Kernel/signature-0.cpp +++ b/targettests/Kernel/signature-0.cpp @@ -53,7 +53,7 @@ class Qernel5 { public: std::vector operator()() __qpu__ { cudaq::qvector q(5); - return cudaq::to_bool_vector(mz(q)); + return mz(q); } }; @@ -61,7 +61,7 @@ class Qernel6 { public: std::vector operator()(int sz) __qpu__ { cudaq::qvector q(sz); - return cudaq::to_bool_vector(mz(q)); + return mz(q); } }; diff --git a/targettests/execution/auto_kernel.cpp b/targettests/execution/auto_kernel.cpp index bbe105cd22d..61b484d6348 100644 --- a/targettests/execution/auto_kernel.cpp +++ b/targettests/execution/auto_kernel.cpp @@ -20,7 +20,7 @@ struct ak2 { h(q[0]); x(q[1]); y(q[2]); - return cudaq::to_bool_vector(mz(q)); + return mz(q); } }; diff --git a/targettests/execution/conditional_run.cpp b/targettests/execution/conditional_run.cpp index 7780f567aa4..99ddf388188 100644 --- a/targettests/execution/conditional_run.cpp +++ b/targettests/execution/conditional_run.cpp @@ -17,7 +17,7 @@ #include struct kernel { - bool operator()() __qpu__ { + auto operator()() __qpu__ { cudaq::qarray<3> q; // Initial state prep x(q[0]); diff --git a/targettests/execution/cudaq_run.cpp b/targettests/execution/cudaq_run.cpp index 97294f1bdd4..c3b14378eb7 100644 --- a/targettests/execution/cudaq_run.cpp +++ b/targettests/execution/cudaq_run.cpp @@ -53,7 +53,11 @@ struct vector_mz_test { cudaq::qvector q(5); cudaq::qubit p; 
x(q); - return cudaq::to_bool_vector(mz(q)); +#ifdef CUDAQ_LIBRARY_MODE + return cudaq::measure_result::to_bool_vector(mz(q)); +#else + return mz(q); +#endif } }; @@ -90,18 +94,6 @@ auto struct_test = []() __qpu__ { return t; }; -__qpu__ auto return_mz() { - cudaq::qubit q; - h(q); - return static_cast(mz(q)); -} - -__qpu__ auto return_vector_mz() { - cudaq::qvector q(3); - x(q); - return cudaq::to_bool_vector(mz(q)); -} - int main() { std::size_t shots = 10; int c = 0; @@ -263,34 +255,6 @@ int main() { } } - { - const auto results = cudaq::run(shots, return_mz); - if (results.size() != shots) { - printf("FAILED! Expected %lu shots. Got %lu\n", shots, results.size()); - } else { - c = 0; - for (auto i : results) - printf("%d: %d\n", c++, (bool)i); - printf("success - return_mz\n"); - } - } - - { - const auto results = cudaq::run(shots, return_vector_mz); - if (results.size() != shots) { - printf("FAILED! Expected %lu shots. Got %lu\n", shots, results.size()); - } else { - c = 0; - for (auto &vec : results) { - printf("%d: {", c++); - for (auto b : vec) - printf("%d ", static_cast(b)); - printf("}\n"); - } - printf("success - return_vector_mz\n"); - } - } - return 0; } @@ -304,5 +268,3 @@ int main() { // CHECK: success - vector_int_test // CHECK: success - vector_float_test // CHECK: success - struct_test -// CHECK: success - return_mz -// CHECK: success - return_vector_mz diff --git a/targettests/execution/cudaq_run_dynamic_result.cpp b/targettests/execution/cudaq_run_dynamic_result.cpp index c46ee18a389..6b240f1fffc 100644 --- a/targettests/execution/cudaq_run_dynamic_result.cpp +++ b/targettests/execution/cudaq_run_dynamic_result.cpp @@ -17,13 +17,13 @@ __qpu__ std::vector arg_size_bool(int n) { cudaq::qvector qs(n); x(qs); - return cudaq::to_bool_vector(mz(qs)); + return mz(qs); } __qpu__ std::vector arg_size_int(int n) { cudaq::qvector qs(n); x(qs); - auto bits = cudaq::to_bool_vector(mz(qs)); + auto bits = mz(qs); std::vector result(n); for (int i = 0; i < n; 
i++) result[i] = bits[i] ? 1 : 0; @@ -33,7 +33,7 @@ __qpu__ std::vector arg_size_int(int n) { __qpu__ std::vector arg_size_float(int n) { cudaq::qvector qs(n); x(qs); - auto bits = cudaq::to_bool_vector(mz(qs)); + auto bits = mz(qs); std::vector result(n); for (int i = 0; i < n; i++) result[i] = bits[i] ? 1.0f : 0.0f; @@ -49,7 +49,7 @@ __qpu__ std::vector branch_vec_test(bool flip) { bool b = mz(ctrl); int sz = b ? 2 : 4; cudaq::qvector data(sz); - return cudaq::to_bool_vector(mz(data)); + return mz(data); } int main() { diff --git a/targettests/execution/cudaq_run_emulation.cpp b/targettests/execution/cudaq_run_emulation.cpp index e4176af6c51..1b73dca5023 100644 --- a/targettests/execution/cudaq_run_emulation.cpp +++ b/targettests/execution/cudaq_run_emulation.cpp @@ -6,6 +6,7 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ + // clang-format off // RUN: nvq++ --target quantinuum --emulate %s -o %t && %t 2>&1 | FileCheck %s -check-prefix=FAIL // RUN: nvq++ --target quantinuum --quantinuum-machine Helios-1SC --emulate %s -o %t && %t | FileCheck %s @@ -29,19 +30,7 @@ __qpu__ int test_kernel(int count) { __qpu__ std::vector mz_test(int count) { cudaq::qvector v(count); h(v); - return cudaq::to_bool_vector(mz(v)); -} - -__qpu__ bool return_mz() { - cudaq::qubit q; - h(q); - return mz(q); -} - -__qpu__ auto return_vector_mz() { - cudaq::qvector q(3); - x(q); - return cudaq::to_bool_vector(mz(q)); + return mz(v); } int main() { @@ -79,39 +68,9 @@ int main() { } } - { - const auto results = cudaq::run(shots, return_mz); - if (results.size() != shots) { - printf("FAILED! Expected %lu shots. Got %lu\n", shots, results.size()); - } else { - c = 0; - for (auto i : results) - printf("%d: %d\n", c++, (bool)i); - printf("success - return_mz\n"); - } - } - - { - const auto results = cudaq::run(shots, return_vector_mz); - if (results.size() != shots) { - printf("FAILED! 
Expected %lu shots. Got %lu\n", shots, results.size()); - } else { - c = 0; - for (auto &vec : results) { - printf("%d: {", c++); - for (auto b : vec) - printf("%d ", (bool)b); - printf("}\n"); - } - printf("success - return_vector_mz\n"); - } - } - return 0; } // FAIL: `run` is not yet supported on this target // CHECK: success! // CHECK: success async! -// CHECK: success - return_mz -// CHECK: success - return_vector_mz diff --git a/targettests/execution/qir_cond_for_loop-3.cpp b/targettests/execution/qir_cond_for_loop-3.cpp index 4285c37c829..a13e27ff08b 100644 --- a/targettests/execution/qir_cond_for_loop-3.cpp +++ b/targettests/execution/qir_cond_for_loop-3.cpp @@ -11,12 +11,6 @@ // RUN: CUDAQ_DEFAULT_SIMULATOR=stim nvq++ --target quantinuum --quantinuum-machine Helios-1SC --emulate %s -o %t && %t | FileCheck %s // clang-format on -// Original test used `std::vector(n)` with element -// assignment, both of which are incompatible with measure_result's deleted -// default ctor and deleted assignment operators. Rewritten to use a loop-local -// `measure_result` variable, retaining the mid-circuit measurement + -// conditional parity pattern with deferred discrimination. 
- #include #include @@ -24,10 +18,11 @@ struct kernel { bool operator()(const int n_iter) __qpu__ { cudaq::qubit q0; cudaq::qubit q1; + std::vector resultVector(n_iter); for (int i = 0; i < n_iter; i++) { h(q0); - cudaq::measure_result result = mz(q0); - if (result) + resultVector[i] = mz(q0); + if (resultVector[i]) x(q1); // toggle q1 on every q0 coin toss that lands heads } return mz(q1); // the measured q1 should contain the parity bit for diff --git a/targettests/execution/qubit_management_if_classical.cpp b/targettests/execution/qubit_management_if_classical.cpp index 4fcf35fdfe3..9f47d62291b 100644 --- a/targettests/execution/qubit_management_if_classical.cpp +++ b/targettests/execution/qubit_management_if_classical.cpp @@ -22,7 +22,7 @@ struct run_test { rx(1., p); y(p); } - bool res = mz(p); + auto res = mz(p); return res; } }; diff --git a/targettests/quantinuum/reset_after_measure.cpp b/targettests/quantinuum/reset_after_measure.cpp index 7c932813c04..c994d79ef67 100644 --- a/targettests/quantinuum/reset_after_measure.cpp +++ b/targettests/quantinuum/reset_after_measure.cpp @@ -9,10 +9,6 @@ // clang-format off // RUN: nvq++ --target quantinuum --emulate %s -o %t && %t | FileCheck %s // clang-format on -// XFAIL: * -// TODO: QIR adaptive profile fails to materialize Result* -> Array* conversion -// for get_measure results. Needs QIR lowering fix for MeasurementsType in -// the adaptive profile pipeline. 
#include #include diff --git a/test/AST-Quake/auto_kernel-1.cpp b/test/AST-Quake/auto_kernel-1.cpp index 0a994388858..e695831d348 100644 --- a/test/AST-Quake/auto_kernel-1.cpp +++ b/test/AST-Quake/auto_kernel-1.cpp @@ -22,11 +22,14 @@ struct ak1 { }; // CHECK-LABEL: func.func @__nvqpp__mlirgen__ak1( -// CHECK-SAME: %[[VAL_0:.*]]: i32{{.*}}) -> !quake.measure attributes -// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_2:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_3]] name "vec" : (!quake.veq<2>) -> !quake.measurements<2> -// CHECK: %[[VAL_5:.*]] = quake.get_measure %[[VAL_4]][%[[VAL_1]]] : (!quake.measurements<2>, i64) -> !quake.measure -// CHECK: return %[[VAL_5]] : !quake.measure +// CHECK-SAME: %[[VAL_0:.*]]: i32{{.*}}) -> i1 attributes +// CHECK: %[[VAL_1:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<2> +// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] name "vec" : (!quake.veq<2>) -> !cc.stdvec +// CHECK: %[[VAL_7:.*]] = quake.discriminate %[[VAL_3]] : (!cc.stdvec) -> !cc.stdvec +// CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_7]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_6]] : (i8) -> i1 +// CHECK: return %[[VAL_8]] : i1 diff --git a/test/AST-Quake/auto_kernel-2.cpp b/test/AST-Quake/auto_kernel-2.cpp index 1a95553e461..d42a07a011c 100644 --- a/test/AST-Quake/auto_kernel-2.cpp +++ b/test/AST-Quake/auto_kernel-2.cpp @@ -21,12 +21,17 @@ struct ak2 { }; // CHECK-LABEL: func.func @__nvqpp__mlirgen__ak2 -// CHECK-SAME: () -> !quake.measurements attributes -// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<5> -// CHECK: cc.loop while -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<5>) -> 
!quake.measurements<5> -// CHECK: %[[VAL_2:.*]] = quake.relax_size %[[VAL_1]] : (!quake.measurements<5>) -> !quake.measurements -// CHECK: return %[[VAL_2]] : !quake.measurements +// CHECK-SAME: () -> !cc.stdvec attributes { +// CHECK: %[[VAL_22:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_19:.*]] = quake.mz %{{.*}} : (!quake.veq<5>) -> !cc.stdvec +// CHECK: %[[VAL_1:.*]] = quake.discriminate %[[VAL_19]] : (!cc.stdvec) -> !cc.stdvec +// CHECK: %[[VAL_20:.*]] = cc.stdvec_data %[[VAL_1]] : (!cc.stdvec) -> !cc.ptr +// CHECK: %[[VAL_21:.*]] = cc.stdvec_size %[[VAL_1]] : (!cc.stdvec) -> i64 +// CHECK: %[[VAL_23:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) : (!cc.ptr, i64, i64) -> !cc.ptr +// CHECK: %[[VAL_24:.*]] = cc.stdvec_init %[[VAL_23]], %[[VAL_21]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_24]] : !cc.stdvec // CHECK: } // CHECK-NOT: func.func {{.*}} @_ZNKSt14_Bit_referencecvbEv() -> i1 +// CHECK-LABEL: func.func private @__nvqpp_vectorCopyCtor( +// CHECK-NOT: func.func {{.*}} @_ZNKSt14_Bit_referencecvbEv() -> i1 diff --git a/test/AST-Quake/base_profile-0.cpp b/test/AST-Quake/base_profile-0.cpp index 1ea649c4056..0067901d282 100644 --- a/test/AST-Quake/base_profile-0.cpp +++ b/test/AST-Quake/base_profile-0.cpp @@ -34,7 +34,6 @@ struct kernel { // CHECK-LABEL: define void @__nvqpp__mlirgen__kernel() // CHECK: tail call void @__quantum__qis__mz__body(%{{.*}}* null, %{{.*}}* null) // CHECK: tail call void @__quantum__qis__mz__body(%{{.*}}* nonnull inttoptr (i64 1 to %{{.*}}*), %{{.*}}* nonnull inttoptr (i64 1 to %{{.*}}*)) -// CHECK: tail call void @__quantum__rt__array_record_output(i64 2, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @cstr.61727261793C6931207820323E00, i64 0, i64 0)) // CHECK: tail call void @__quantum__rt__result_record_output(%{{.*}}* null, i8* nonnull getelementptr inbounds ([3 x i8], [3 x i8]* @cstr.623000, i64 0, i64 0)) // CHECK: tail call void 
@__quantum__rt__result_record_output(%{{.*}}* nonnull inttoptr (i64 1 to %{{.*}}*), i8* nonnull getelementptr inbounds ([3 x i8], [3 x i8]* @cstr.623100, i64 0, i64 0)) // clang-format on diff --git a/test/AST-Quake/qir_profiles.cpp b/test/AST-Quake/base_profile-1.cpp similarity index 97% rename from test/AST-Quake/qir_profiles.cpp rename to test/AST-Quake/base_profile-1.cpp index 27020e23fb0..231baa16041 100644 --- a/test/AST-Quake/qir_profiles.cpp +++ b/test/AST-Quake/base_profile-1.cpp @@ -89,13 +89,6 @@ struct comprehensive { } }; -struct adapt_mz_read { - bool operator()() __qpu__ { - cudaq::qubit q; - return static_cast(mz(q)); - } -}; - // clang-format off // BASE-LABEL: define void @__nvqpp__mlirgen__comprehensive() @@ -175,7 +168,6 @@ struct adapt_mz_read { // ADAPT: tail call void @__quantum__qis__z__body(%Qubit* nonnull inttoptr (i64 4 to %Qubit*)) // ADAPT: tail call void @__quantum__qis__z__body(%Qubit* nonnull inttoptr (i64 5 to %Qubit*)) // ADAPT: tail call void @__quantum__qis__z__body(%Qubit* nonnull inttoptr (i64 6 to %Qubit*)) -// ADAPT: tail call void @__quantum__qis__cz__body(%Qubit* nonnull inttoptr (i64 2 to %Qubit*), %Qubit* nonnull inttoptr (i64 4 to %Qubit*)) // ADAPT: tail call void @__quantum__qis__t__body(%Qubit* null) // ADAPT: tail call void @__quantum__qis__t__body(%Qubit* nonnull inttoptr (i64 4 to %Qubit*)) // ADAPT: tail call void @__quantum__qis__t__body(%Qubit* nonnull inttoptr (i64 5 to %Qubit*)) @@ -201,31 +193,28 @@ struct adapt_mz_read { // ADAPT: tail call void @__quantum__qis__swap__body(%Qubit* null, %Qubit* nonnull inttoptr (i64 6 to %Qubit*)) // ADAPT: tail call void @__quantum__qis__u3__body(double 8.000000e-01, double 5.000000e-01, double -1.000000e+00, %Qubit* nonnull inttoptr (i64 3 to %Qubit*)) // ADAPT: tail call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) -// ADAPT: tail call void @__quantum__rt__array_record_output(i64 7, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* 
@cstr.61727261793C6931207820373E00, i64 0, i64 0)) // ADAPT: tail call void @__quantum__rt__result_record_output(%Result* null, i8* nonnull getelementptr inbounds ([10 x i8], [10 x i8]* @cstr.73696E676C65746F6E00, i64 0, i64 0)) // ADAPT: tail call void @__quantum__qis__mz__body(%Qubit* nonnull inttoptr (i64 1 to %Qubit*), %Result* nonnull inttoptr (i64 1 to %Result*)) // ADAPT: tail call void @__quantum__rt__result_record_output(%Result* nonnull inttoptr (i64 1 to %Result*), i8* nonnull getelementptr inbounds ([5 x i8], [5 x i8]* @cstr.65696E7300, i64 0, i64 0)) +// ADAPT: %[[VAL_2:.*]] = tail call i1 @__quantum__qis__read_result__body(%Result* nonnull inttoptr (i64 1 to %Result*)) // ADAPT: tail call void @__quantum__qis__mz__body(%Qubit* nonnull inttoptr (i64 2 to %Qubit*), %Result* nonnull inttoptr (i64 2 to %Result*)) // ADAPT: tail call void @__quantum__rt__result_record_output(%Result* nonnull inttoptr (i64 2 to %Result*), i8* nonnull getelementptr inbounds ([4 x i8], [4 x i8]* @cstr.64756200, i64 0, i64 0)) +// ADAPT: %[[VAL_3:.*]] = tail call i1 @__quantum__qis__read_result__body(%Result* nonnull inttoptr (i64 2 to %Result*)) // ADAPT: tail call void @__quantum__qis__mz__body(%Qubit* nonnull inttoptr (i64 3 to %Qubit*), %Result* nonnull inttoptr (i64 3 to %Result*)) // ADAPT: tail call void @__quantum__rt__result_record_output(%Result* nonnull inttoptr (i64 3 to %Result*), i8* nonnull getelementptr inbounds ([4 x i8], [4 x i8]* @cstr.64756200, i64 0, i64 0)) +// ADAPT: %[[VAL_4:.*]] = tail call i1 @__quantum__qis__read_result__body(%Result* nonnull inttoptr (i64 3 to %Result*)) // ADAPT: tail call void @__quantum__qis__mz__body(%Qubit* nonnull inttoptr (i64 4 to %Qubit*), %Result* nonnull inttoptr (i64 4 to %Result*)) // ADAPT: tail call void @__quantum__rt__result_record_output(%Result* nonnull inttoptr (i64 4 to %Result*), i8* nonnull getelementptr inbounds ([5 x i8], [5 x i8]* @cstr.7472697000, i64 0, i64 0)) +// ADAPT: %[[VAL_5:.*]] = tail call i1 
@__quantum__qis__read_result__body(%Result* nonnull inttoptr (i64 4 to %Result*)) // ADAPT: tail call void @__quantum__qis__mz__body(%Qubit* nonnull inttoptr (i64 5 to %Qubit*), %Result* nonnull inttoptr (i64 5 to %Result*)) // ADAPT: tail call void @__quantum__rt__result_record_output(%Result* nonnull inttoptr (i64 5 to %Result*), i8* nonnull getelementptr inbounds ([5 x i8], [5 x i8]* @cstr.7472697000, i64 0, i64 0)) +// ADAPT: %[[VAL_6:.*]] = tail call i1 @__quantum__qis__read_result__body(%Result* nonnull inttoptr (i64 5 to %Result*)) // ADAPT: tail call void @__quantum__qis__mz__body(%Qubit* nonnull inttoptr (i64 6 to %Qubit*), %Result* nonnull inttoptr (i64 6 to %Result*)) // ADAPT: tail call void @__quantum__rt__result_record_output(%Result* nonnull inttoptr (i64 6 to %Result*), i8* nonnull getelementptr inbounds ([5 x i8], [5 x i8]* @cstr.7472697000, i64 0, i64 0)) +// ADAPT: %[[VAL_7:.*]] = tail call i1 @__quantum__qis__read_result__body(%Result* nonnull inttoptr (i64 6 to %Result*)) // ADAPT: ret void // ADAPT: } -// ADAPT-LABEL: define i1 @__nvqpp__mlirgen__adapt_mz_read() -// ADAPT: tail call void @__quantum__qis__mz__body(%[[VAL_2:.*]]* null, %[[VAL_3:.*]]* null) -// ADAPT: tail call void @__quantum__rt__array_record_output(i64 1, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @cstr.{{.*}}, i64 0, i64 0)) -// ADAPT: tail call void @__quantum__rt__result_record_output(%[[VAL_3]]* null, i8* nonnull getelementptr inbounds ([7 x i8], [7 x i8]* @cstr.{{.*}}, i64 0, i64 0)) -// ADAPT: %[[VAL_4:.*]] = tail call i1 @__quantum__qis__read_result__body(%[[VAL_3]]* null) -// ADAPT: ret i1 %[[VAL_4]] -// ADAPT: } - // FULL-LABEL: define void @__nvqpp__mlirgen__comprehensive() // FULL: %[[VAL_0:.*]] = tail call %Array* @__quantum__rt__qubit_allocate_array(i64 9) // FULL: %[[VAL_2:.*]] = tail call %Qubit** @__quantum__rt__array_get_element_ptr_1d(%Array* %[[VAL_0]], i64 0) diff --git a/test/AST-Quake/bool_literal.cpp b/test/AST-Quake/bool_literal.cpp index 
5df0fe63859..48578561bc0 100644 --- a/test/AST-Quake/bool_literal.cpp +++ b/test/AST-Quake/bool_literal.cpp @@ -28,8 +28,8 @@ struct testBoolLiteral { // CHECK: %[[VAL_2:.*]] = cc.alloca i1 // CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr // CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i1 -// CHECK: cc.store %[[VAL_4]], %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_2]] : !cc.ptr -// CHECK: return %[[VAL_5]] : i1 +// CHECK: %[[VAL_9:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i1 +// CHECK: cc.store %[[VAL_9]], %[[VAL_2]] : !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_2]] : !cc.ptr +// CHECK: return %[[VAL_4]] : i1 diff --git a/test/AST-Quake/bug_3270.cpp b/test/AST-Quake/bug_3270.cpp index e03f5cacafb..d6ae42c199f 100644 --- a/test/AST-Quake/bug_3270.cpp +++ b/test/AST-Quake/bug_3270.cpp @@ -26,8 +26,21 @@ __qpu__ void foo() { // CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> () // CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_0]][2] : (!quake.veq<3>) -> !quake.ref // CHECK: quake.x %[[VAL_3]] : (!quake.ref) -> () -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_1]] name "result%[[VAL_0]]" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_2]] name "result%[[VAL_1]]" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_3]] name "result%[[VAL_2]]" : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_4:.*]] = cc.alloca !cc.array +// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_1]] name "result%[[VAL_0]]" : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_6:.*]] = quake.discriminate %[[VAL_5]] : (!quake.measure) -> i1 +// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast unsigned %[[VAL_6]] : (i1) -> i8 +// CHECK: cc.store %[[VAL_8]], %[[VAL_7]] : !cc.ptr +// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_2]] name "result%[[VAL_1]]" : (!quake.ref) 
-> !quake.measure +// CHECK: %[[VAL_10:.*]] = quake.discriminate %[[VAL_9]] : (!quake.measure) -> i1 +// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.cast unsigned %[[VAL_10]] : (i1) -> i8 +// CHECK: cc.store %[[VAL_12]], %[[VAL_11]] : !cc.ptr +// CHECK: %[[VAL_13:.*]] = quake.mz %[[VAL_3]] name "result%[[VAL_2]]" : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_14:.*]] = quake.discriminate %[[VAL_13]] : (!quake.measure) -> i1 +// CHECK: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_4]][2] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_16:.*]] = cc.cast unsigned %[[VAL_14]] : (i1) -> i8 +// CHECK: cc.store %[[VAL_16]], %[[VAL_15]] : !cc.ptr // CHECK: return // CHECK: } diff --git a/test/AST-Quake/call_qpu.cpp b/test/AST-Quake/call_qpu.cpp index e766ad62137..bd6d3f1ce6d 100644 --- a/test/AST-Quake/call_qpu.cpp +++ b/test/AST-Quake/call_qpu.cpp @@ -12,14 +12,14 @@ std::vector func_achat(cudaq::qview<> &qv) __qpu__ { // measure the entire register - return cudaq::to_bool_vector(mz(qv)); + return mz(qv); } // CHECK-LABEL: func.func @__nvqpp__mlirgen__function_func_achat._Z10func_achatRN5cudaq5qviewILm2EEE( // CHECK-SAME: %[[VAL_0:.*]]: !quake.veq) -> !cc.stdvec attributes {"cudaq-kernel", no_this} { // CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_0]] : (!quake.veq) -> !quake.measurements -// CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measurements) -> !cc.stdvec +// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_0]] : (!quake.veq) -> !cc.stdvec +// CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!cc.stdvec) -> !cc.stdvec // CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec) -> !cc.ptr // CHECK: %[[VAL_5:.*]] = cc.stdvec_size %[[VAL_3]] : (!cc.stdvec) -> i64 // CHECK: %[[VAL_6:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_4]], %[[VAL_5]], %[[VAL_1]]) : (!cc.ptr, i64, i64) -> !cc.ptr @@ -52,21 +52,21 @@ int func_shiim(cudaq::qvector<> &qv) __qpu__ 
{ // CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr // CHECK: call @__nvqpp_vectorCopyToStack(%[[VAL_10]], %[[VAL_7]], %[[VAL_8]]) : (!cc.ptr, !cc.ptr, i64) -> () // CHECK: %[[VAL_11:.*]] = cc.undef i32 -// CHECK: %[[VAL_12:.*]]:2 = cc.loop while ((%[[VAL_13:.*]] = %[[VAL_2]], %[[VAL_14:.*]] = %[[VAL_11]]) -> (i64, i32)) { -// CHECK: %[[VAL_15:.*]] = arith.cmpi slt, %[[VAL_13]], %[[VAL_8]] : i64 -// CHECK: cc.condition %[[VAL_15]](%[[VAL_13]], %[[VAL_14]] : i64, i32) +// CHECK: %[[VAL_13:.*]]:2 = cc.loop while ((%[[VAL_14:.*]] = %[[VAL_2]], %[[VAL_15:.*]] = %[[VAL_11]]) -> (i64, i32)) { +// CHECK: %[[VAL_16:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_8]] : i64 +// CHECK: cc.condition %[[VAL_16]](%[[VAL_14]], %[[VAL_15]] : i64, i32) // CHECK: } do { -// CHECK: ^bb0(%[[VAL_16:.*]]: i64, %[[VAL_17:.*]]: i32): -// CHECK: %[[VAL_18:.*]] = cc.compute_ptr %[[VAL_9]]{{\[}}%[[VAL_16]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_19:.*]] = cc.load %[[VAL_18]] : !cc.ptr -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_19]] : (i8) -> i1 -// CHECK: %[[VAL_21:.*]] = cc.if(%[[VAL_20]]) -> i32 { -// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_17]], %[[VAL_3]] : i32 +// CHECK: ^bb0(%[[VAL_17:.*]]: i64, %[[VAL_18:.*]]: i32): +// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_9]][%[[VAL_17]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_19]] : !cc.ptr +// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_20]] : (i8) -> i1 +// CHECK: %[[VAL_21:.*]] = cc.if(%[[VAL_12]]) -> i32 { +// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_18]], %[[VAL_3]] : i32 // CHECK: cc.continue %[[VAL_22]] : i32 // CHECK: } else { -// CHECK: cc.continue %[[VAL_17]] : i32 +// CHECK: cc.continue %[[VAL_18]] : i32 // CHECK: } -// CHECK: cc.continue %[[VAL_16]], %[[VAL_23:.*]] : i64, i32 +// CHECK: cc.continue %[[VAL_17]], %[[VAL_23:.*]] : i64, i32 // CHECK: } step { // CHECK: ^bb0(%[[VAL_24:.*]]: i64, %[[VAL_25:.*]]: i32): // CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_24]], %[[VAL_1]] : 
i64 @@ -132,7 +132,6 @@ void func_arba() __qpu__ { // CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_1]] : i64 // CHECK: cc.continue %[[VAL_19]] : i64 // CHECK: } {invariant} -// CHECK: } else { // CHECK: } // CHECK: %[[VAL_20:.*]] = cc.loop while ((%[[VAL_21:.*]] = %[[VAL_2]]) -> (i64)) { // CHECK: %[[VAL_22:.*]] = arith.cmpi slt, %[[VAL_21]], %[[VAL_0]] : i64 diff --git a/test/AST-Quake/cast.cpp b/test/AST-Quake/cast.cpp index 7ff33801582..39d9d820135 100644 --- a/test/AST-Quake/cast.cpp +++ b/test/AST-Quake/cast.cpp @@ -6,7 +6,7 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -// RUN: cudaq-quake %s | cudaq-opt | FileCheck %s +// RUN: cudaq-quake %s | FileCheck %s #include @@ -14,7 +14,7 @@ struct testCast { void operator()() __qpu__ { cudaq::qubit q0, q1; h(q0); - double bit = static_cast(mz(q0)); + double bit = mz(q0); // This tests implicit casting from double to bool if (bit) x(q1); @@ -29,16 +29,16 @@ struct testCast { // CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref // CHECK: quake.h %[[VAL_1]] : (!quake.ref) -> () // CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_5:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> f64 -// CHECK: %[[VAL_6:.*]] = cc.alloca f64 -// CHECK: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_8:.*]] = arith.cmpf une, %[[VAL_7]], %[[VAL_0]] : f64 -// CHECK: cc.if(%[[VAL_8]]) { +// CHECK: %[[VAL_10:.*]] = quake.discriminate %[[VAL_3]] : +// CHECK: %[[VAL_4:.*]] = cc.cast unsigned %[[VAL_10]] : (i1) -> f64 +// CHECK: %[[VAL_5:.*]] = cc.alloca f64 +// CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_7:.*]] = arith.cmpf une, %[[VAL_6]], %[[VAL_0]] : f64 +// 
CHECK: cc.if(%[[VAL_7]]) { // CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> () // CHECK: } -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_2]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_8:.*]] = quake.mz %[[VAL_2]] : (!quake.ref) -> !quake.measure // CHECK: return // CHECK: } // clang-format on @@ -58,24 +58,24 @@ struct testCastBoolMeasurement { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__testCastBoolMeasurement() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_1_i32:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref // CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref -// CHECK: quake.h %[[VAL_1]] : (!quake.ref) -> () -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_5:.*]] = cc.alloca i1 -// CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.cast unsigned %[[VAL_6]] : (i1) -> i32 -// CHECK: %[[VAL_8:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_7]], %[[VAL_8]] : !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.load %[[VAL_8]] : !cc.ptr -// CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_0]] : i32 -// CHECK: cc.if(%[[VAL_10]]) { -// CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> () +// CHECK: quake.h %[[VAL_0]] : (!quake.ref) -> () +// CHECK: %[[VAL_m0:.*]] = quake.mz %[[VAL_0]] name {{.*}} : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_2:.*]] = quake.discriminate %[[VAL_m0]] : +// CHECK: %[[VAL_3:.*]] = cc.alloca i1 +// CHECK: cc.store %[[VAL_2]], %[[VAL_3]] : !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i32 +// CHECK: %[[VAL_6:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr 
+// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_7:.*]], %[[VAL_1_i32]] : i32 +// CHECK: cc.if(%[[VAL_8]]) { +// CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () // CHECK: } -// CHECK: %[[VAL_11:.*]] = quake.mz %[[VAL_2]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_m1:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure // CHECK: return // CHECK: } // clang-format on @@ -94,16 +94,16 @@ struct testUnsignedCastBoolConstTrue { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__testUnsignedCastBoolConstTrue() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 -// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_2:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_4:.*]] = arith.cmpi eq, %[[VAL_3]], %[[VAL_0]] : i32 -// CHECK: cc.if(%[[VAL_4]]) { -// CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () +// CHECK: %[[VAL_1_i32:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_1:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_1_i32]], %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_3:.*]] = arith.cmpi eq, %[[VAL_2:.*]], %[[VAL_1_i32]] : i32 +// CHECK: cc.if(%[[VAL_3]]) { +// CHECK: quake.x %[[VAL_0]] : (!quake.ref) -> () // CHECK: } -// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_m:.*]] = quake.mz %[[VAL_0]] : (!quake.ref) -> !quake.measure // CHECK: return // CHECK: } // clang-format on @@ -122,16 +122,16 @@ struct testUnsignedCastBoolConstFalse { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__testUnsignedCastBoolConstFalse() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i32 -// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref 
-// CHECK: %[[VAL_2:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_4:.*]] = arith.cmpi eq, %[[VAL_3]], %[[VAL_0]] : i32 -// CHECK: cc.if(%[[VAL_4]]) { -// CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () +// CHECK: %[[VAL_0_i32:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_1:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_0_i32]], %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_3:.*]] = arith.cmpi eq, %[[VAL_2:.*]], %[[VAL_0_i32]] : i32 +// CHECK: cc.if(%[[VAL_3]]) { +// CHECK: quake.x %[[VAL_0]] : (!quake.ref) -> () // CHECK: } -// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_m:.*]] = quake.mz %[[VAL_0]] : (!quake.ref) -> !quake.measure // CHECK: return // CHECK: } // clang-format on @@ -150,16 +150,16 @@ struct testSignedCastBoolConstTrue { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__testSignedCastBoolConstTrue() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 -// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_2:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_4:.*]] = arith.cmpi eq, %[[VAL_3]], %[[VAL_0]] : i32 -// CHECK: cc.if(%[[VAL_4]]) { -// CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () +// CHECK: %[[VAL_1_i32:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_1:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_1_i32]], %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_3:.*]] = arith.cmpi eq, %[[VAL_2:.*]], %[[VAL_1_i32]] : i32 +// CHECK: cc.if(%[[VAL_3]]) { +// CHECK: quake.x %[[VAL_0]] : (!quake.ref) -> () // CHECK: } -// CHECK: 
%[[VAL_5:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_m:.*]] = quake.mz %[[VAL_0]] : (!quake.ref) -> !quake.measure // CHECK: return // CHECK: } // clang-format on @@ -178,48 +178,16 @@ struct testSignedCastBoolConstFalse { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__testSignedCastBoolConstFalse() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i32 -// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_2:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr -// CHECK: %[[VAL_4:.*]] = arith.cmpi eq, %[[VAL_3]], %[[VAL_0]] : i32 -// CHECK: cc.if(%[[VAL_4]]) { -// CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () -// CHECK: } -// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure -// CHECK: return -// CHECK: } -// clang-format on - -struct testCastIntMeasurement { - void operator()() __qpu__ { - cudaq::qubit q0, q1; - h(q0); - int bit = static_cast(mz(q0)); - if (bit == 1) - x(q1); - mz(q1); - } -}; - -// clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__testCastIntMeasurement() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 -// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref -// CHECK: quake.h %[[VAL_1]] : (!quake.ref) -> () -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_5:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i32 -// CHECK: %[[VAL_6:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_0]] : i32 -// CHECK: cc.if(%[[VAL_8]]) { -// CHECK: quake.x %[[VAL_2]] : (!quake.ref) 
-> () +// CHECK: %[[VAL_0_i32:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_1:.*]] = cc.alloca i32 +// CHECK: cc.store %[[VAL_0_i32]], %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr +// CHECK: %[[VAL_3:.*]] = arith.cmpi eq, %[[VAL_2:.*]], %[[VAL_0_i32]] : i32 +// CHECK: cc.if(%[[VAL_3]]) { +// CHECK: quake.x %[[VAL_0]] : (!quake.ref) -> () // CHECK: } -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_2]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_m:.*]] = quake.mz %[[VAL_0]] : (!quake.ref) -> !quake.measure // CHECK: return // CHECK: } // clang-format on diff --git a/test/AST-Quake/const_reference_extension.cpp b/test/AST-Quake/const_reference_extension.cpp index a97f0990ec7..e64ff72b007 100644 --- a/test/AST-Quake/const_reference_extension.cpp +++ b/test/AST-Quake/const_reference_extension.cpp @@ -28,9 +28,8 @@ __qpu__ uint64_t foo() { return qubit_values_to_integer(results); } -// clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__function_qubit_values_to_integer. 
-// CHECK-SAME: %[[VAL_0:.*]]: !quake.measurements) -> i64 attributes +// CHECK-SAME: (%[[VAL_0:.*]]: !cc.stdvec) -> i64 attributes // CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64 // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : i64 // CHECK-DAG: %[[VAL_3:.*]] = cc.alloca i64 @@ -40,28 +39,29 @@ __qpu__ uint64_t foo() { // CHECK: cc.store %[[VAL_2]], %[[VAL_4]] : !cc.ptr // CHECK: cc.loop while { // CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr -// CHECK: %[[VAL_6:.*]] = quake.measurements_size %[[VAL_0]] : (!quake.measurements) -> i64 +// CHECK: %[[VAL_6:.*]] = cc.stdvec_size %[[VAL_0]] : (!cc.stdvec) -> i64 // CHECK: %[[VAL_7:.*]] = arith.cmpi ult, %[[VAL_5]], %[[VAL_6]] : i64 // CHECK: cc.condition %[[VAL_7]] // CHECK: } do { // CHECK: %[[VAL_8:.*]] = cc.load %[[VAL_4]] : !cc.ptr -// CHECK: %[[VAL_9:.*]] = quake.get_measure %[[VAL_0]]{{\[}}%[[VAL_8]]] : (!quake.measurements, i64) -> !quake.measure -// CHECK: %[[VAL_10:.*]] = quake.discriminate %[[VAL_9]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_11:.*]] = cc.cast unsigned %[[VAL_10]] : (i1) -> i64 -// CHECK: %[[VAL_12:.*]] = cc.load %[[VAL_4]] : !cc.ptr -// CHECK: %[[VAL_13:.*]] = arith.shli %[[VAL_11]], %[[VAL_12]] : i64 -// CHECK: %[[VAL_14:.*]] = cc.load %[[VAL_3]] : !cc.ptr -// CHECK: %[[VAL_15:.*]] = arith.ori %[[VAL_14]], %[[VAL_13]] : i64 -// CHECK: cc.store %[[VAL_15]], %[[VAL_3]] : !cc.ptr +// CHECK: %[[VAL_9:.*]] = cc.stdvec_data %[[VAL_0]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_9]]{{\[}}%[[VAL_8]]] : (!cc.ptr>, i64) -> !cc.ptr +// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr +// CHECK-DAG: %[[VAL_12:.*]] = cc.cast unsigned %{{.*}} : (i{{[18]}}) -> i64 +// CHECK-DAG: %[[VAL_13:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// CHECK: %[[VAL_14:.*]] = arith.shli %[[VAL_12]], %[[VAL_13]] : i64 +// CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_3]] : !cc.ptr +// CHECK: %[[VAL_16:.*]] = arith.ori %[[VAL_15]], %[[VAL_14]] : i64 +// CHECK: cc.store %[[VAL_16]], 
%[[VAL_3]] : !cc.ptr // CHECK: cc.continue // CHECK: } step { -// CHECK: %[[VAL_16:.*]] = cc.load %[[VAL_4]] : !cc.ptr -// CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_1]] : i64 -// CHECK: cc.store %[[VAL_17]], %[[VAL_4]] : !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_17]], %[[VAL_1]] : i64 +// CHECK: cc.store %[[VAL_18]], %[[VAL_4]] : !cc.ptr // CHECK: } // CHECK: } -// CHECK: %[[VAL_18:.*]] = cc.load %[[VAL_3]] : !cc.ptr -// CHECK: return %[[VAL_18]] : i64 +// CHECK: %[[VAL_19:.*]] = cc.load %[[VAL_3]] : !cc.ptr +// CHECK: return %[[VAL_19]] : i64 // CHECK: } // CHECK-LABEL: func.func @__nvqpp__mlirgen__function_foo. @@ -82,9 +82,8 @@ __qpu__ uint64_t foo() { // CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_1]] : i64 // CHECK: cc.continue %[[VAL_10]] : i64 // CHECK: } {invariant} -// CHECK: %[[VAL_11:.*]] = quake.mz %[[VAL_3]] name "results" : (!quake.veq<2>) -> !quake.measurements<2> -// CHECK: %[[VAL_12:.*]] = quake.relax_size %[[VAL_11]] : (!quake.measurements<2>) -> !quake.measurements +// CHECK: %[[VAL_11:.*]] = quake.mz %[[VAL_3]] name "results" : (!quake.veq<2>) -> !cc.stdvec +// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_11]] : (!cc.stdvec) -> !cc.stdvec // CHECK: %[[VAL_13:.*]] = call @__nvqpp__mlirgen__function_qubit_values_to_integer. 
// CHECK: return %[[VAL_13]] : i64 // CHECK: } -// clang-format on diff --git a/test/AST-Quake/control_flow.cpp b/test/AST-Quake/control_flow.cpp index 6b586b8941f..3e09ad848e8 100644 --- a/test/AST-Quake/control_flow.cpp +++ b/test/AST-Quake/control_flow.cpp @@ -103,7 +103,7 @@ struct C { // CHECK: } // CHECK: } // CHECK: call @_Z2g4v() : () -> () -// CHECK: %[[VAL_28:.*]] = quake.mz %[[VAL_6]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_28:.*]] = quake.mz %[[VAL_6]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -192,7 +192,7 @@ struct D { // CHECK: } // CHECK: } // CHECK: call @_Z2g4v() : () -> () -// CHECK: %[[VAL_28:.*]] = quake.mz %[[VAL_6]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_28:.*]] = quake.mz %[[VAL_6]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -280,7 +280,7 @@ struct E { // CHECK: cf.br ^bb1 // CHECK: ^bb7: // CHECK: call @_Z2g4v() : () -> () -// CHECK: %[[VAL_28:.*]] = quake.mz %[[VAL_6]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_28:.*]] = quake.mz %[[VAL_6]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: quake.dealloc %[[VAL_6]] : !quake.veq<2> // CHECK: return @@ -370,6 +370,6 @@ struct F { // CHECK: cf.br ^bb1 // CHECK: ^bb8: // CHECK: call @_Z2g4v() : () -> () -// CHECK: %[[VAL_28:.*]] = quake.mz %[[VAL_6]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_28:.*]] = quake.mz %[[VAL_6]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: quake.dealloc %[[VAL_6]] : !quake.veq<2> // CHECK: return diff --git a/test/AST-Quake/ctrl_vector.cpp b/test/AST-Quake/ctrl_vector.cpp index 626ef24238d..828eba75c10 100644 --- a/test/AST-Quake/ctrl_vector.cpp +++ b/test/AST-Quake/ctrl_vector.cpp @@ -29,7 +29,7 @@ struct lower_ctrl_as_qreg { // CHECK: quake.h [%[[VAL_0]]] %[[VAL_2]] : (!quake.veq<4>, !quake.ref) -> () // CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_1]][1] : (!quake.veq<2>) -> !quake.ref // CHECK: quake.x [%[[VAL_0]]] %[[VAL_3]] : (!quake.veq<4>, 
!quake.ref) -> () -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_1]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_1]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/AST-Quake/cudaq_run.cpp b/test/AST-Quake/cudaq_run.cpp index cd7f5c3175b..4c7f1a20c75 100644 --- a/test/AST-Quake/cudaq_run.cpp +++ b/test/AST-Quake/cudaq_run.cpp @@ -16,7 +16,7 @@ struct K9 { std::vector operator()() __qpu__ { cudaq::qvector q(5); cudaq::qubit p; - return cudaq::to_bool_vector(mz(q)); + return mz(q); } }; @@ -71,7 +71,7 @@ __qpu__ std::vector dyn_vec_test(int n) { cudaq::qvector qs(n); for (int i = 0; i < n; i++) FlipQubit{}(qs[i]); - return cudaq::to_bool_vector(mz(qs)); + return mz(qs); } // A kernel with a measurement-branch-dependent result size @@ -81,7 +81,7 @@ __qpu__ std::vector branch_vec_test() { bool b = mz(ctrl); int sz = b ? 2 : 4; cudaq::qvector data(sz); - return cudaq::to_bool_vector(mz(data)); + return mz(data); } // CHECK-LABEL: func.func @__nvqpp__mlirgen__K9.run() diff --git a/test/AST-Quake/cudaq_types.cpp b/test/AST-Quake/cudaq_types.cpp index 2e77e08a48c..e4ef92cf2ea 100644 --- a/test/AST-Quake/cudaq_types.cpp +++ b/test/AST-Quake/cudaq_types.cpp @@ -36,7 +36,7 @@ struct Qernel0 { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__Qernel0() // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on diff --git a/test/AST-Quake/grover.cpp b/test/AST-Quake/grover.cpp index 2464674d380..b4ada4c8230 100644 --- a/test/AST-Quake/grover.cpp +++ b/test/AST-Quake/grover.cpp @@ -152,7 +152,7 @@ int main(int argc, char *argv[]) { // CHECK: cc.store %[[VAL_36]], %[[VAL_30]] : !cc.ptr // CHECK: } // CHECK: } -// CHECK: %[[VAL_37:.*]] = quake.mz %[[VAL_21]] : (!quake.veq) -> 
!quake.measurements +// CHECK: %[[VAL_37:.*]] = quake.mz %[[VAL_21]] : (!quake.veq) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/AST-Quake/if.cpp b/test/AST-Quake/if.cpp index e64fe464be1..a24698c6c2c 100644 --- a/test/AST-Quake/if.cpp +++ b/test/AST-Quake/if.cpp @@ -6,7 +6,7 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -// RUN: cudaq-quake %s | cudaq-opt | FileCheck %s +// RUN: cudaq-quake %s | FileCheck %s #include @@ -80,20 +80,20 @@ struct kernel_short_circuit_and { // CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 // CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<3> // CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_2]][0] : (!quake.veq<3>) -> !quake.ref -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_3]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = quake.discriminate %[[VAL_4]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_6:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_0]] : i1 -// CHECK: %[[VAL_7:.*]] = cc.if(%[[VAL_6]]) -> i1 { +// CHECK: %[[VAL_10:.*]] = quake.mz %[[VAL_3]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_10]] : +// CHECK: %[[VAL_5:.*]] = arith.cmpi eq, %[[VAL_4]], %[[VAL_0]] : i1 +// CHECK: %[[VAL_6:.*]] = cc.if(%[[VAL_5]]) -> i1 { // CHECK: cc.continue %[[VAL_0]] : i1 // CHECK: } else { -// CHECK: %[[VAL_8:.*]] = quake.extract_ref %[[VAL_2]][1] : (!quake.veq<3>) -> !quake.ref -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_10:.*]] = quake.discriminate %[[VAL_9]] : (!quake.measure) -> i1 -// CHECK: cc.continue %[[VAL_10]] : i1 +// CHECK: %[[VAL_7:.*]] = quake.extract_ref %[[VAL_2]][1] : (!quake.veq<3>) -> !quake.ref +// CHECK: %[[VAL_8:.*]] = quake.mz %[[VAL_7]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_81:.*]] = quake.discriminate %[[VAL_8]] : +// CHECK: cc.continue %[[VAL_81]] : i1 // CHECK: } -// CHECK: 
cc.if(%[[VAL_11:.*]]) { -// CHECK: %[[VAL_12:.*]] = quake.extract_ref %[[VAL_2]][2] : (!quake.veq<3>) -> !quake.ref -// CHECK: quake.x %[[VAL_12]] : (!quake.ref) -> () +// CHECK: cc.if(%[[VAL_6]]) { +// CHECK: %[[VAL_9:.*]] = quake.extract_ref %[[VAL_2]][2] : (!quake.veq<3>) -> !quake.ref +// CHECK: quake.x %[[VAL_9]] : (!quake.ref) -> () // CHECK: } // CHECK: return %[[VAL_1]] : i32 // CHECK: } @@ -112,20 +112,20 @@ struct kernel_short_circuit_or { // CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 // CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<3> // CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_2]][0] : (!quake.veq<3>) -> !quake.ref -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_3]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = quake.discriminate %[[VAL_4]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_6:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_0]] : i1 -// CHECK: %[[VAL_7:.*]] = cc.if(%[[VAL_6]]) -> i1 { -// CHECK: cc.continue %[[VAL_6]] : i1 +// CHECK: %[[VAL_41:.*]] = quake.mz %[[VAL_3]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_41]] : +// CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_4]], %[[VAL_0]] : i1 +// CHECK: %[[VAL_6:.*]] = cc.if(%[[VAL_5]]) -> i1 { +// CHECK: cc.continue %[[VAL_5]] : i1 // CHECK: } else { -// CHECK: %[[VAL_8:.*]] = quake.extract_ref %[[VAL_2]][1] : (!quake.veq<3>) -> !quake.ref -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_10:.*]] = quake.discriminate %[[VAL_9]] : (!quake.measure) -> i1 -// CHECK: cc.continue %[[VAL_10]] : i1 +// CHECK: %[[VAL_7:.*]] = quake.extract_ref %[[VAL_2]][1] : (!quake.veq<3>) -> !quake.ref +// CHECK: %[[VAL_8:.*]] = quake.mz %[[VAL_7]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_81:.*]] = quake.discriminate %[[VAL_8]] : +// CHECK: cc.continue %[[VAL_81]] : i1 // CHECK: } -// CHECK: cc.if(%[[VAL_11:.*]]) { -// CHECK: %[[VAL_12:.*]] = quake.extract_ref %[[VAL_2]][2] : (!quake.veq<3>) -> !quake.ref -// 
CHECK: quake.x %[[VAL_12]] : (!quake.ref) -> () +// CHECK: cc.if(%[[VAL_6]]) { +// CHECK: %[[VAL_9:.*]] = quake.extract_ref %[[VAL_2]][2] : (!quake.veq<3>) -> !quake.ref +// CHECK: quake.x %[[VAL_9]] : (!quake.ref) -> () // CHECK: } // CHECK: return %[[VAL_1]] : i32 // CHECK: } @@ -143,15 +143,19 @@ struct kernel_ternary { // CHECK: %[[VAL_1:.*]] = quake.alloca !quake.veq<3> // CHECK: %[[VAL_2:.*]] = quake.extract_ref %[[VAL_1]][0] : (!quake.veq<3>) -> !quake.ref // CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_5:.*]] = cc.if(%[[VAL_4]]) -> !quake.measure { -// CHECK: %[[VAL_6:.*]] = quake.extract_ref %[[VAL_1]][1] : (!quake.veq<3>) -> !quake.ref -// CHECK: %[[VAL_7:.*]] = quake.mz %[[VAL_6]] : (!quake.ref) -> !quake.measure -// CHECK: cc.continue %[[VAL_7]] : !quake.measure +// CHECK: %[[VAL_31:.*]] = quake.discriminate %[[VAL_3]] : +// CHECK: %[[VAL_4:.*]] = cc.if(%[[VAL_31]]) -> i1 { +// CHECK: %[[VAL_5:.*]] = quake.extract_ref %[[VAL_1]][1] : (!quake.veq<3>) -> !quake.ref +// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_61:.*]] = quake.discriminate %[[VAL_6]] : +// CHECK: cc.continue %[[VAL_61]] : i1 // CHECK: } else { -// CHECK: %[[VAL_8:.*]] = quake.extract_ref %[[VAL_1]][2] : (!quake.veq<3>) -> !quake.ref -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.ref) -> !quake.measure -// CHECK: cc.continue %[[VAL_9]] : !quake.measure +// CHECK: %[[VAL_7:.*]] = quake.extract_ref %[[VAL_1]][2] : (!quake.veq<3>) -> !quake.ref +// CHECK: %[[VAL_8:.*]] = quake.mz %[[VAL_7]] : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_81:.*]] = quake.discriminate %[[VAL_8]] : +// CHECK: cc.continue %[[VAL_81]] : i1 // CHECK: } +// CHECK: %[[VAL_9:.*]] = cc.alloca i1 +// CHECK: cc.store %[[VAL_4]], %[[VAL_9]] : !cc.ptr // CHECK: return %[[VAL_0]] : i32 // CHECK: } diff --git 
a/test/AST-Quake/indirect_callable.cpp b/test/AST-Quake/indirect_callable.cpp index 9824d334ddd..80ebad2b583 100644 --- a/test/AST-Quake/indirect_callable.cpp +++ b/test/AST-Quake/indirect_callable.cpp @@ -36,7 +36,7 @@ void meanwhile_on_safari() { // CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr // CHECK: %[[VAL_7:.*]] = cc.cast signed %[[VAL_6]] : (i32) -> i64 // CHECK: %[[VAL_8:.*]] = quake.alloca !quake.veq[%[[VAL_7]] : i64] -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.veq) -> !quake.measurements +// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.veq) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/AST-Quake/loop_unroll-1.cpp b/test/AST-Quake/loop_unroll-1.cpp index 24c4f3ba706..7f0bdd62e12 100644 --- a/test/AST-Quake/loop_unroll-1.cpp +++ b/test/AST-Quake/loop_unroll-1.cpp @@ -20,13 +20,21 @@ struct C { }; // CHECK-LABEL: func.func @__nvqpp__mlirgen__C() -// CHECK-DAG: %[[VAL_0:.*]] = quake.alloca !quake.veq<2> -// CHECK-DAG: %[[VAL_1:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "singleQubit" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<2>) -> !quake.ref -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_3]] name "myRegister%[[VAL_0]]" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][1] : (!quake.veq<2>) -> !quake.ref -// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] name "myRegister%[[VAL_1]]" : (!quake.ref) -> !quake.measure +// CHECK-DAG: %[[VAL_3:.*]] = quake.alloca !quake.veq<2> +// CHECK-DAG: %[[VAL_10:.*]] = quake.alloca !quake.ref +// CHECK-DAG: %[[VAL_11:.*]] = quake.mz %[[VAL_10]] name "singleQubit" : (!quake.ref) -> !quake.measure +// CHECK-DAG: %[[VAL_4:.*]] = cc.alloca !cc.array +// CHECK: %[[VAL_5:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<2>) -> !quake.ref +// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] name "myRegister%0" : (!quake.ref) -> !quake.measure +// CHECK: 
%[[VAL_10:.*]] = quake.discriminate %[[VAL_6]] : {{.*}} -> i1 +// CHECK: %[[VAL_14:.*]] = cc.cast unsigned %[[VAL_10]] +// CHECK: cc.store %[[VAL_14]], %{{.*}} : !cc.ptr +// CHECK: %[[VAL_7:.*]] = quake.extract_ref %[[VAL_3]][1] : (!quake.veq<2>) -> !quake.ref +// CHECK: %[[VAL_8:.*]] = quake.mz %[[VAL_7]] name "myRegister%1" : (!quake.ref) -> !quake.measure +// CHECK: %[[VAL_11:.*]] = quake.discriminate %[[VAL_8]] : +// CHECK: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.cast unsigned %[[VAL_11]] +// CHECK: cc.store %[[VAL_13]], %[[VAL_9]] : !cc.ptr // CHECK: return // CHECK: } diff --git a/test/AST-Quake/loop_unroll-3.cpp b/test/AST-Quake/loop_unroll-3.cpp index d73ea53bbf8..ef2ebdb3732 100644 --- a/test/AST-Quake/loop_unroll-3.cpp +++ b/test/AST-Quake/loop_unroll-3.cpp @@ -107,6 +107,6 @@ struct Qernel { // CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> () // CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<1>) -> !quake.ref // CHECK: quake.x %[[VAL_3]] : (!quake.ref) -> () -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/AST-Quake/measure_bell.cpp b/test/AST-Quake/measure_bell.cpp index c2a5103ae7b..1471775ec87 100644 --- a/test/AST-Quake/measure_bell.cpp +++ b/test/AST-Quake/measure_bell.cpp @@ -32,10 +32,8 @@ struct bell { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__bell( // CHECK-SAME: %[[VAL_0:.*]]: i32) attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK-DAG: %[[VAL_C1I64:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_C0I64:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i32 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 // CHECK: %[[VAL_3:.*]] = 
cc.alloca i32 // CHECK: cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr // CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2> @@ -50,31 +48,33 @@ struct bell { // CHECK: %[[VAL_9:.*]] = arith.cmpi slt, %[[VAL_7]], %[[VAL_8]] : i32 // CHECK: cc.condition %[[VAL_9]] // CHECK: } do { -// CHECK: cc.scope { -// CHECK: %[[VAL_10:.*]] = quake.extract_ref %[[VAL_4]][0] : (!quake.veq<2>) -> !quake.ref -// CHECK: quake.h %[[VAL_10]] : (!quake.ref) -> () -// CHECK: %[[VAL_11:.*]] = quake.extract_ref %[[VAL_4]][1] : (!quake.veq<2>) -> !quake.ref -// CHECK: quake.x {{\[}}%[[VAL_10]]] %[[VAL_11]] : (!quake.ref, !quake.ref) -> () -// CHECK: %[[VAL_12:.*]] = quake.mz %[[VAL_4]] name "results" : (!quake.veq<2>) -> !quake.measurements<2> -// CHECK: %[[VAL_13:.*]] = quake.get_measure %[[VAL_12]][%[[VAL_C0I64]]] : (!quake.measurements<2>, i64) -> !quake.measure -// CHECK: %[[VAL_14:.*]] = quake.discriminate %[[VAL_13]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_15:.*]] = cc.alloca i1 -// CHECK: cc.store %[[VAL_14]], %[[VAL_15]] : !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.load %[[VAL_15]] : !cc.ptr -// CHECK: %[[VAL_17:.*]] = quake.get_measure %[[VAL_12]][%[[VAL_C1I64]]] : (!quake.measurements<2>, i64) -> !quake.measure -// CHECK: %[[VAL_18:.*]] = quake.discriminate %[[VAL_17]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_19:.*]] = arith.cmpi eq, %[[VAL_16]], %[[VAL_18]] : i1 -// CHECK: cc.if(%[[VAL_19]]) { -// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_1]] : i32 -// CHECK: cc.store %[[VAL_21]], %[[VAL_5]] : !cc.ptr -// CHECK: } +// CHECK: %[[VAL_10:.*]] = quake.extract_ref %[[VAL_4]][0] : (!quake.veq<2>) -> !quake.ref +// CHECK: quake.h %[[VAL_10]] : (!quake.ref) -> () +// CHECK: %[[VAL_11:.*]] = quake.extract_ref %[[VAL_4]][1] : (!quake.veq<2>) -> !quake.ref +// CHECK: quake.x {{\[}}%[[VAL_10]]] %[[VAL_11]] : (!quake.ref, !quake.ref) -> () +// CHECK: %[[VAL_112:.*]] = quake.mz %[[VAL_4]] name "results" : (!quake.veq<2>) -> 
!cc.stdvec +// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_112]] : +// CHECK: %[[VAL_13:.*]] = cc.stdvec_data %[[VAL_12]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_13]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_14]] : !cc.ptr +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_15]] : (i8) -> i1 +// CHECK: %[[VAL_16:.*]] = cc.alloca i1 +// CHECK: cc.store %[[VAL_35]], %[[VAL_16]] : !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.load %[[VAL_16]] : !cc.ptr +// CHECK: %[[VAL_18:.*]] = cc.compute_ptr %[[VAL_13]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_19:.*]] = cc.load %[[VAL_18]] : !cc.ptr +// CHECK: %[[VAL_39:.*]] = cc.cast %[[VAL_19]] : (i8) -> i1 +// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_39]] : i1 +// CHECK: cc.if(%[[VAL_20]]) { +// CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_1]] : i32 +// CHECK: cc.store %[[VAL_22]], %[[VAL_5]] : !cc.ptr // CHECK: } // CHECK: cc.continue // CHECK: } step { -// CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_1]] : i32 -// CHECK: cc.store %[[VAL_23]], %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_1]] : i32 +// CHECK: cc.store %[[VAL_24]], %[[VAL_6]] : !cc.ptr // CHECK: } // CHECK: } // CHECK: return @@ -99,10 +99,8 @@ struct libertybell { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__libertybell( // CHECK-SAME: %[[VAL_0:.*]]: i32) attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK-DAG: %[[VAL_C1I64:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_C0I64:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i32 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 // CHECK: %[[VAL_3:.*]] = 
cc.alloca i32 // CHECK: cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr // CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2> @@ -121,22 +119,24 @@ struct libertybell { // CHECK: quake.h %[[VAL_10]] : (!quake.ref) -> () // CHECK: %[[VAL_11:.*]] = quake.extract_ref %[[VAL_4]][1] : (!quake.veq<2>) -> !quake.ref // CHECK: quake.x {{\[}}%[[VAL_10]]] %[[VAL_11]] : (!quake.ref, !quake.ref) -> () -// CHECK: %[[VAL_12:.*]] = quake.mz %[[VAL_4]] name "results" : (!quake.veq<2>) -> !quake.measurements<2> -// CHECK: %[[VAL_13:.*]] = quake.get_measure %[[VAL_12]][%[[VAL_C0I64]]] : (!quake.measurements<2>, i64) -> !quake.measure -// CHECK: %[[VAL_14:.*]] = quake.get_measure %[[VAL_12]][%[[VAL_C1I64]]] : (!quake.measurements<2>, i64) -> !quake.measure -// CHECK: %[[VAL_15:.*]] = quake.discriminate %[[VAL_13]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_16:.*]] = quake.discriminate %[[VAL_14]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_17:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_16]] : i1 -// CHECK: cc.if(%[[VAL_17]]) { -// CHECK: %[[VAL_18:.*]] = cc.load %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_1]] : i32 -// CHECK: cc.store %[[VAL_19]], %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_112:.*]] = quake.mz %[[VAL_4]] name "results" : (!quake.veq<2>) -> !cc.stdvec +// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_112]] : +// CHECK: %[[VAL_13:.*]] = cc.stdvec_data %[[VAL_12]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_13]] : (!cc.ptr>) -> !cc.ptr +// CHECK-DAG: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_13]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK-DAG: %[[VAL_16:.*]] = cc.load %[[VAL_15]] : !cc.ptr +// CHECK-DAG: %[[VAL_17:.*]] = cc.load %[[VAL_14]] : !cc.ptr +// CHECK: %[[VAL_18:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_16]] : i8 +// CHECK: cc.if(%[[VAL_18]]) { +// CHECK: %[[VAL_19:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_1]] : i32 +// CHECK: cc.store %[[VAL_20]], %[[VAL_5]] : 
!cc.ptr // CHECK: } // CHECK: cc.continue // CHECK: } step { -// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_1]] : i32 -// CHECK: cc.store %[[VAL_21]], %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_1]] : i32 +// CHECK: cc.store %[[VAL_22]], %[[VAL_6]] : !cc.ptr // CHECK: } // CHECK: } // CHECK: return @@ -163,10 +163,8 @@ struct tinkerbell { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__tinkerbell( // CHECK-SAME: %[[VAL_0:.*]]: i32) attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK-DAG: %[[VAL_C1I64:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_C0I64:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i32 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 // CHECK: %[[VAL_3:.*]] = cc.alloca i32 // CHECK: cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr // CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2> @@ -185,22 +183,24 @@ struct tinkerbell { // CHECK: quake.h %[[VAL_10]] : (!quake.ref) -> () // CHECK: %[[VAL_11:.*]] = quake.extract_ref %[[VAL_4]][1] : (!quake.veq<2>) -> !quake.ref // CHECK: quake.x {{\[}}%[[VAL_10]]] %[[VAL_11]] : (!quake.ref, !quake.ref) -> () -// CHECK: %[[VAL_12:.*]] = quake.mz %[[VAL_4]] name "results" : (!quake.veq<2>) -> !quake.measurements<2> -// CHECK: %[[VAL_13:.*]] = quake.get_measure %[[VAL_12]][%[[VAL_C0I64]]] : (!quake.measurements<2>, i64) -> !quake.measure -// CHECK: %[[VAL_14:.*]] = quake.get_measure %[[VAL_12]][%[[VAL_C1I64]]] : (!quake.measurements<2>, i64) -> !quake.measure -// CHECK: %[[VAL_15:.*]] = quake.discriminate %[[VAL_13]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_16:.*]] = quake.discriminate %[[VAL_14]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_17:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_16]] : i1 -// 
CHECK: cc.if(%[[VAL_17]]) { -// CHECK: %[[VAL_18:.*]] = cc.load %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_1]] : i32 -// CHECK: cc.store %[[VAL_19]], %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_112:.*]] = quake.mz %[[VAL_4]] name "results" : (!quake.veq<2>) -> !cc.stdvec +// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_112]] : +// CHECK: %[[VAL_13:.*]] = cc.stdvec_data %[[VAL_12]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_14:.*]] = cc.cast %[[VAL_13]] : (!cc.ptr>) -> !cc.ptr +// CHECK-DAG: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_13]][1] : (!cc.ptr>) -> !cc.ptr +// CHECK-DAG: %[[VAL_16:.*]] = cc.load %[[VAL_15]] : !cc.ptr +// CHECK-DAG: %[[VAL_17:.*]] = cc.load %[[VAL_14]] : !cc.ptr +// CHECK: %[[VAL_18:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_16]] : i8 +// CHECK: cc.if(%[[VAL_18]]) { +// CHECK: %[[VAL_19:.*]] = cc.load %[[VAL_5]] : !cc.ptr +// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_1]] : i32 +// CHECK: cc.store %[[VAL_20]], %[[VAL_5]] : !cc.ptr // CHECK: } // CHECK: cc.continue // CHECK: } step { -// CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_6]] : !cc.ptr -// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_1]] : i32 -// CHECK: cc.store %[[VAL_21]], %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_1]] : i32 +// CHECK: cc.store %[[VAL_22]], %[[VAL_6]] : !cc.ptr // CHECK: } // CHECK: } // CHECK: return diff --git a/test/AST-Quake/measure_result_assign.cpp b/test/AST-Quake/measure_result_assign.cpp deleted file mode 100644 index 114bb04260f..00000000000 --- a/test/AST-Quake/measure_result_assign.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. 
* - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -// RUN: not cudaq-quake %s 2>&1 | FileCheck %s - -#include - -__qpu__ bool assign_kernel() { - cudaq::qvector q(2); - auto results = mz(q); - results[0] = mz(q[1]); - return static_cast(results[0]); -} - -// CHECK: error:{{.*}}deleted operator '=' -// CHECK: error: C++ source has errors diff --git a/test/AST-Quake/measure_result_compare.cpp b/test/AST-Quake/measure_result_compare.cpp deleted file mode 100644 index a34ee2f2d93..00000000000 --- a/test/AST-Quake/measure_result_compare.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. 
* - ******************************************************************************/ - -// RUN: cudaq-quake %s | cudaq-opt | FileCheck %s - -#include - -__qpu__ int compare_kernel() { - cudaq::qvector q(2); - cudaq::measure_result a = mz(q[0]); - cudaq::measure_result b = mz(q[1]); - if (a == b) - return 1; - if (a != b) - return 0; - return -1; -} - -// clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_compare_kernel._Z14compare_kernelv() -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { -// CHECK: %[[VAL_0:.*]] = arith.constant -1 : i32 -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 -// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_4:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<2>) -> !quake.ref -// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_4]] name "a" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_6:.*]] = quake.extract_ref %[[VAL_3]][1] : (!quake.veq<2>) -> !quake.ref -// CHECK: %[[VAL_7:.*]] = quake.mz %[[VAL_6]] name "b" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_8:.*]] = quake.discriminate %[[VAL_5]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_9:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_9]] : i1 -// CHECK: cc.if(%[[VAL_10]]) { -// CHECK: cc.unwind_return %[[VAL_1]] : i32 -// CHECK: } -// CHECK: %[[VAL_11:.*]] = quake.discriminate %[[VAL_5]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_13:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_12]] : i1 -// CHECK: cc.if(%[[VAL_13]]) { -// CHECK: cc.unwind_return %[[VAL_2]] : i32 -// CHECK: } -// CHECK: return %[[VAL_0]] : i32 -// CHECK: } -// clang-format on - -__qpu__ int compare_with_bool_kernel() { - cudaq::qubit q; - cudaq::measure_result a = mz(q); - if (a == true) - return 1; - if (a != false) - return 2; - return 0; -} - -// 
clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_compare_with_bool_kernel._Z24compare_with_bool_kernelv() -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i32 -// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 -// CHECK: %[[VAL_2:.*]] = arith.constant false -// CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32 -// CHECK: %[[VAL_4:.*]] = arith.constant true -// CHECK: %[[VAL_5:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] name "a" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_7:.*]] = quake.discriminate %[[VAL_6]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_4]] : i1 -// CHECK: cc.if(%[[VAL_8]]) { -// CHECK: cc.unwind_return %[[VAL_3]] : i32 -// CHECK: } -// CHECK: %[[VAL_9:.*]] = quake.discriminate %[[VAL_6]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_10:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_2]] : i1 -// CHECK: cc.if(%[[VAL_10]]) { -// CHECK: cc.unwind_return %[[VAL_1]] : i32 -// CHECK: } -// CHECK: return %[[VAL_0]] : i32 -// CHECK: } -// clang-format on diff --git a/test/AST-Quake/measure_result_device_entry.cpp b/test/AST-Quake/measure_result_device_entry.cpp deleted file mode 100644 index 3c1a56e3119..00000000000 --- a/test/AST-Quake/measure_result_device_entry.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. 
* - ******************************************************************************/ - -// RUN: cudaq-quake %s | cudaq-opt | FileCheck %s - -#include - -__qpu__ cudaq::measure_result device_helper(cudaq::qubit &q) { - h(q); - return mz(q); -} - -__qpu__ bool entry_kernel() { - cudaq::qubit q; - auto m = device_helper(q); - return static_cast(m); -} - -// clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_device_helper -// CHECK-SAME: (%[[VAL_0:.*]]: !quake.ref) -> !quake.measure attributes {"cudaq-kernel", no_this} { -// CHECK: quake.h %[[VAL_0]] : (!quake.ref) -> () -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.ref) -> !quake.measure -// CHECK: return %[[VAL_1]] : !quake.measure -// CHECK: } - -// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_entry_kernel._Z12entry_kernelv() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { -// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_1:.*]] = call @__nvqpp__mlirgen__function_device_helper{{.*}}(%[[VAL_0]]) : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_2:.*]] = quake.discriminate %[[VAL_1]] : (!quake.measure) -> i1 -// CHECK: return %[[VAL_2]] : i1 -// CHECK: } -// clang-format on diff --git a/test/AST-Quake/mz.cpp b/test/AST-Quake/mz.cpp index d3e31898df8..b7a5fe29463 100644 --- a/test/AST-Quake/mz.cpp +++ b/test/AST-Quake/mz.cpp @@ -6,7 +6,7 @@ * the terms of the Apache License 2.0 which accompanies this distribution. 
* ******************************************************************************/ -// RUN: cudaq-quake %s | cudaq-opt | FileCheck %s +// RUN: cudaq-quake %s | FileCheck %s #include @@ -18,67 +18,40 @@ struct S { }; // clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__S() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<20> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<20>) -> !quake.measurements<20> +// CHECK-LABEL: func.func @__nvqpp__mlirgen__S() attributes +// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<20> +// CHECK: quake.mz %[[VAL_2]] : (!quake.veq<20>) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on struct VectorOfStaticVeq { - std::vector operator()() __qpu__ { - cudaq::qubit q1; - cudaq::qvector reg1(4); - cudaq::qvector reg2(2); - cudaq::qubit q2; - return mz(q1, reg1, reg2, q2); - } -}; - -// clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__VectorOfStaticVeq() -> !quake.measurements -// CHECK-NOT: cudaq-entrypoint -// CHECK-DAG: %[[VAL_0:.*]] = quake.alloca !quake.ref -// CHECK-DAG: %[[VAL_1:.*]] = quake.alloca !quake.veq<4> -// CHECK-DAG: %[[VAL_2:.*]] = quake.alloca !quake.veq<2> -// CHECK-DAG: %[[VAL_3:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]] : (!quake.ref, !quake.veq<4>, !quake.veq<2>, !quake.ref) -> !quake.measurements<8> -// CHECK: %[[VAL_5:.*]] = quake.relax_size %[[VAL_4]] : (!quake.measurements<8>) -> !quake.measurements -// CHECK: return %[[VAL_5]] : !quake.measurements -// CHECK: } -// clang-format on - -struct VectorOfStaticVeq_Bool { std::vector operator()() __qpu__ { cudaq::qubit q1; cudaq::qvector reg1(4); cudaq::qvector reg2(2); cudaq::qubit q2; - auto res = mz(q1, reg1, reg2, q2); - return cudaq::to_bool_vector(res); + return mz(q1, reg1, reg2, q2); } }; -// clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__VectorOfStaticVeq_Bool() -> 
!cc.stdvec -// CHECK-SAME: attributes {"cudaq-entrypoint" -// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_1:.*]] = quake.alloca !quake.ref -// CHECK-DAG: %[[VAL_2:.*]] = quake.alloca !quake.veq<4> -// CHECK-DAG: %[[VAL_3:.*]] = quake.alloca !quake.veq<2> -// CHECK-DAG: %[[VAL_4:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] name "res" : (!quake.ref, !quake.veq<4>, !quake.veq<2>, !quake.ref) -> !quake.measurements<8> -// CHECK: %[[VAL_6:.*]] = quake.discriminate %[[VAL_5]] : (!quake.measurements<8>) -> !cc.stdvec -// CHECK: %[[VAL_7:.*]] = cc.stdvec_data %[[VAL_6]] : (!cc.stdvec) -> !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.stdvec_size %[[VAL_6]] : (!cc.stdvec) -> i64 -// CHECK: %[[VAL_9:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_7]], %[[VAL_8]], %[[VAL_0]]) : (!cc.ptr, i64, i64) -> !cc.ptr -// CHECK: %[[VAL_10:.*]] = cc.stdvec_init %[[VAL_9]], %[[VAL_8]] : (!cc.ptr, i64) -> !cc.stdvec -// CHECK: return %[[VAL_10]] : !cc.stdvec +// CHECK-LABEL: func.func @__nvqpp__mlirgen__VectorOfStaticVeq() -> !cc.stdvec attributes { +// CHECK: %[[VAL_11:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.veq<4> +// CHECK: %[[VAL_6:.*]] = quake.alloca !quake.veq<2> +// CHECK: %[[VAL_7:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_81:.*]] = quake.mz %[[VAL_0]], %[[VAL_3]], %[[VAL_6]], %[[VAL_7]] : (!quake.ref, !quake.veq<4>, !quake.veq<2>, !quake.ref) -> !cc.stdvec +// CHECK: %[[VAL_8:.*]] = quake.discriminate %[[VAL_81]] : +// CHECK: %[[VAL_9:.*]] = cc.stdvec_data %[[VAL_8]] : (!cc.stdvec) -> !cc.ptr +// CHECK: %[[VAL_10:.*]] = cc.stdvec_size %[[VAL_8]] : (!cc.stdvec) -> i64 +// CHECK: %[[VAL_12:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_9]], %[[VAL_10]], %[[VAL_11]]) : (!cc.ptr, i64, i64) -> !cc.ptr +// CHECK: %[[VAL_13:.*]] = cc.stdvec_init %[[VAL_12]], %[[VAL_10]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: 
return %[[VAL_13]] : !cc.stdvec // CHECK: } -// clang-format on struct VectorOfDynamicVeq { - std::vector operator()(unsigned i, unsigned j) __qpu__ { + std::vector operator()(unsigned i, unsigned j) __qpu__ { cudaq::qubit q1; cudaq::qvector reg1(i); cudaq::qvector reg2(j); @@ -87,11 +60,9 @@ struct VectorOfDynamicVeq { } }; -// clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__VectorOfDynamicVeq( -// CHECK-SAME: %[[VAL_0:.*]]: i32, -// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !quake.measurements attributes {"cudaq-kernel"} { -// CHECK-NOT: cudaq-entrypoint +// CHECK-SAME: %[[VAL_0:.*]]: i32{{.*}}, %[[VAL_1:.*]]: i32{{.*}}) -> !cc.stdvec attributes { +// CHECK: %[[VAL_15:.*]] = arith.constant 1 : i64 // CHECK: %[[VAL_2:.*]] = cc.alloca i32 // CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr // CHECK: %[[VAL_3:.*]] = cc.alloca i32 @@ -99,50 +70,17 @@ struct VectorOfDynamicVeq { // CHECK: %[[VAL_4:.*]] = quake.alloca !quake.ref // CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_2]] : !cc.ptr // CHECK: %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_5]] : (i32) -> i64 -// CHECK: %[[VAL_7:.*]] = quake.alloca !quake.veq{{\[}}%[[VAL_6]] : i64] +// CHECK: %[[VAL_7:.*]] = quake.alloca !quake.veq[%[[VAL_6]] : i64] // CHECK: %[[VAL_8:.*]] = cc.load %[[VAL_3]] : !cc.ptr // CHECK: %[[VAL_9:.*]] = cc.cast unsigned %[[VAL_8]] : (i32) -> i64 -// CHECK: %[[VAL_10:.*]] = quake.alloca !quake.veq{{\[}}%[[VAL_9]] : i64] +// CHECK: %[[VAL_10:.*]] = quake.alloca !quake.veq[%[[VAL_9]] : i64] // CHECK: %[[VAL_11:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_12:.*]] = quake.mz %[[VAL_4]], %[[VAL_7]], %[[VAL_10]], %[[VAL_11]] : (!quake.ref, !quake.veq, !quake.veq, !quake.ref) -> !quake.measurements -// CHECK: return %[[VAL_12]] : !quake.measurements +// CHECK: %[[VAL_112:.*]] = quake.mz %[[VAL_4]], %[[VAL_7]], %[[VAL_10]], %[[VAL_11]] : (!quake.ref, !quake.veq, !quake.veq, !quake.ref) -> !cc.stdvec +// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_112]] : +// CHECK: %[[VAL_13:.*]] = 
cc.stdvec_data %[[VAL_12]] : (!cc.stdvec) -> !cc.ptr +// CHECK: %[[VAL_14:.*]] = cc.stdvec_size %[[VAL_12]] : (!cc.stdvec) -> i64 +// CHECK: %[[VAL_16:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]]) : (!cc.ptr, i64, i64) -> !cc.ptr +// CHECK: %[[VAL_17:.*]] = cc.stdvec_init %[[VAL_16]], %[[VAL_14]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_17]] : !cc.stdvec // CHECK: } -// clang-format on -struct MxTest { - void operator()() __qpu__ { - cudaq::qubit q; - auto r = mx(q); - bool b = r; - } -}; - -// clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__MxTest() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_1:.*]] = quake.mx %[[VAL_0]] name "r" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_2:.*]] = quake.discriminate %[[VAL_1]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_3:.*]] = cc.alloca i1 -// CHECK: cc.store %[[VAL_2]], %[[VAL_3]] : !cc.ptr -// CHECK: return -// CHECK: } -// clang-format on - -struct MyTest { - void operator()() __qpu__ { - cudaq::qubit q; - auto r = my(q); - bool b = r; - } -}; - -// clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__MyTest() attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_1:.*]] = quake.my %[[VAL_0]] name "r" : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_2:.*]] = quake.discriminate %[[VAL_1]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_3:.*]] = cc.alloca i1 -// CHECK: cc.store %[[VAL_2]], %[[VAL_3]] : !cc.ptr -// CHECK: return -// CHECK: } -// clang-format on diff --git a/test/AST-Quake/qalloc_initialization.cpp b/test/AST-Quake/qalloc_initialization.cpp index 7ddacc59eeb..ca4ba11510b 100644 --- a/test/AST-Quake/qalloc_initialization.cpp +++ b/test/AST-Quake/qalloc_initialization.cpp @@ -19,7 +19,7 @@ struct Vanilla { std::vector operator()() __qpu__ { cudaq::qvector v{cudaq::state{0., 1., 1., 0.}}; h(v); - return 
cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -54,7 +54,7 @@ struct VanillaBean { std::vector operator()() __qpu__ { cudaq::qvector v = cudaq::state{0., 1., 1., 0.}; h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -91,7 +91,7 @@ struct Cherry { cudaq::qvector v{{std::initializer_list>{ {0.0, 1.0}, {0.6, 0.4}, {1.0, 0.0}, {0.0, 0.0}}}}; h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -131,7 +131,7 @@ struct MooseTracks { {std::complex{0.0, 1.0}, std::complex{0.75, 0.25}, std::complex{1.0, 0.0}, std::complex{0.0, 0.0}}}; h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -170,7 +170,7 @@ struct RockyRoad { cudaq::qvector v{cudaq::state{0.0 + 1.0i, std::complex{0.8, 0.2}, 1.0 + 0.0i, std::complex{0.0, 0.0}}}; h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -285,7 +285,7 @@ struct Neapolitan { std::vector operator()() __qpu__ { cudaq::qvector v{getComplexInit()}; h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -317,7 +317,7 @@ struct ButterPecan { std::vector operator()() __qpu__ { cudaq::qvector v(getComplexInit()); h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; diff --git a/test/AST-Quake/qalloc_state.cpp b/test/AST-Quake/qalloc_state.cpp index 5180e9000f3..1d4d53f0e38 100644 --- a/test/AST-Quake/qalloc_state.cpp +++ b/test/AST-Quake/qalloc_state.cpp @@ -14,7 +14,7 @@ struct Eins { std::vector operator()(cudaq::state *state) __qpu__ { cudaq::qvector v(state); h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -28,7 +28,7 @@ struct Zwei { std::vector operator()(const cudaq::state *state) __qpu__ { cudaq::qvector v(state); h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -42,7 +42,7 @@ struct Drei { std::vector operator()(cudaq::state &state) __qpu__ { cudaq::qvector v(state); h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; @@ -56,7 +56,7 @@ struct Vier { std::vector operator()(const 
cudaq::state &state) __qpu__ { cudaq::qvector v(state); h(v); - return cudaq::to_bool_vector(mz(v)); + return mz(v); } }; diff --git a/test/AST-Quake/qpe.cpp b/test/AST-Quake/qpe.cpp index 9012aebb114..5249c5aecc9 100644 --- a/test/AST-Quake/qpe.cpp +++ b/test/AST-Quake/qpe.cpp @@ -329,7 +329,7 @@ int main() { // CHECK: } // CHECK: } // CHECK: call @__nvqpp__mlirgen__function_iqft{{.*}}(%[[VAL_20]]) : (!quake.veq) -> () -// CHECK: %[[VAL_54:.*]] = quake.mz %[[VAL_20]] : (!quake.veq) -> !quake.measurements +// CHECK: %[[VAL_54:.*]] = quake.mz %[[VAL_20]] : (!quake.veq) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/AST-Quake/reset_after_measure.cpp b/test/AST-Quake/reset_after_measure.cpp index 0e0d80227e9..15de755fea3 100644 --- a/test/AST-Quake/reset_after_measure.cpp +++ b/test/AST-Quake/reset_after_measure.cpp @@ -79,7 +79,10 @@ void reuse2() __qpu__ { // CHECK: cc.if(%[[VAL_2]]) { // CHECK: quake.x %[[VAL_0]] : (!quake.ref) -> () // CHECK: } -// CHECK: cc.if(%[[VAL_2]]) { +// CHECK: %[[VAL_3:.*]] = cc.alloca i1 +// CHECK: cc.store %[[VAL_2]], %[[VAL_3]] : !cc.ptr +// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr +// CHECK: cc.if(%[[VAL_4]]) { // CHECK: quake.x %[[VAL_0]] : (!quake.ref) -> () // CHECK: } // clang-format on @@ -109,7 +112,10 @@ void reuse3() __qpu__ { // CHECK: cc.if(%[[VAL_3]]) { // CHECK: quake.x %[[VAL_0]] : (!quake.ref) -> () // CHECK: } -// CHECK: cc.if(%[[VAL_3]]) { +// CHECK: %[[VAL_4:.*]] = cc.alloca i1 +// CHECK: cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// CHECK: cc.if(%[[VAL_5]]) { // CHECK: quake.x %[[VAL_0]] : (!quake.ref) -> () // CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () // CHECK: } diff --git a/test/AST-Quake/separate_compilation.cpp b/test/AST-Quake/separate_compilation.cpp index f7ab183aa3f..5ea64335439 100644 --- a/test/AST-Quake/separate_compilation.cpp +++ b/test/AST-Quake/separate_compilation.cpp @@ -18,14 +18,6 @@ __qpu__ uint64_t 
test_entry_point() { return otherKernel(results); } -// clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_test_entry_point._Z16test_entry_pointv() -> i64 -// CHECK-SAME: attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} -// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<5> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] name "results" : (!quake.veq<5>) -> !quake.measurements<5> -// CHECK: %[[VAL_2:.*]] = quake.relax_size %[[VAL_1]] : (!quake.measurements<5>) -> !quake.measurements -// CHECK: %[[VAL_3:.*]] = call @{{.*otherKernel.*}}(%[[VAL_2]]) : (!quake.measurements) -> i64 -// CHECK: return %[[VAL_3]] : i64 -// CHECK: } -// CHECK: func.func private @{{.*otherKernel.*}}(!quake.measurements) -> i64 -// clang-format on +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_test_entry_point. +// CHECK-SAME: () -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK: %[[VAL_3:.*]] = call @__nvqpp__mlirgen__function_otherKernel.{{.*}}(%{{.*}}) : (!cc.stdvec) -> i64 diff --git a/test/AST-Quake/simple.cpp b/test/AST-Quake/simple.cpp index c76455540b6..165c1fe5deb 100644 --- a/test/AST-Quake/simple.cpp +++ b/test/AST-Quake/simple.cpp @@ -61,7 +61,7 @@ struct ghz { // CHECK: cc.store %[[VAL_21]], %[[VAL_8]] : !cc.ptr // CHECK: } // CHECK: } -// CHECK: %[[VAL_22:.*]] = quake.mz %[[VAL_6]] : (!quake.veq) -> !quake.measurements +// CHECK: %[[VAL_22:.*]] = quake.mz %[[VAL_6]] : (!quake.veq) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/AST-Quake/simple_qarray.cpp b/test/AST-Quake/simple_qarray.cpp index b6552564ef4..7ada6b1ee8f 100644 --- a/test/AST-Quake/simple_qarray.cpp +++ b/test/AST-Quake/simple_qarray.cpp @@ -74,7 +74,7 @@ int main() { // CHECK: cc.store %[[VAL_16]], %[[VAL_5]] : !cc.ptr // CHECK: } // CHECK: } -// CHECK: %[[VAL_17:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<5>) -> !quake.measurements<5> +// CHECK: %[[VAL_17:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<5>) -> !cc.stdvec // CHECK: return // CHECK: } 
diff --git a/test/AST-Quake/slice.cpp b/test/AST-Quake/slice.cpp index 90c9e90b64d..c7bbc85e505 100644 --- a/test/AST-Quake/slice.cpp +++ b/test/AST-Quake/slice.cpp @@ -38,13 +38,15 @@ __qpu__ bool issue_3092() { } // CHECK-LABEL: func.func @__nvqpp__mlirgen__function_issue_3092._Z10issue_3092v() -> i1 -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.veq<6> -// CHECK: %[[VAL_2:.*]] = quake.extract_ref %[[VAL_1]][3] : (!quake.veq<6>) -> !quake.ref -// CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> () -// CHECK: %[[VAL_3:.*]] = quake.subveq %[[VAL_1]], 3, 3 : (!quake.veq<6>) -> !quake.veq<1> -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<1>) -> !quake.measurements<1> -// CHECK: %[[VAL_5:.*]] = quake.get_measure %[[VAL_4]][%[[VAL_0]]] : (!quake.measurements<1>, i64) -> !quake.measure -// CHECK: %[[VAL_6:.*]] = quake.discriminate %[[VAL_5]] : (!quake.measure) -> i1 -// CHECK: return %[[VAL_6]] : i1 +// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<6> +// CHECK: %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][3] : (!quake.veq<6>) -> !quake.ref +// CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () +// CHECK: %[[VAL_2:.*]] = quake.subveq %[[VAL_0]], 3, 3 : (!quake.veq<6>) -> !quake.veq<1> +// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<1>) -> !cc.stdvec +// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_3]] : (!cc.stdvec) -> !cc.stdvec +// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_4]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_6]] : !cc.ptr +// CHECK: %[[VAL_8:.*]] = cc.cast %[[VAL_7]] : (i8) -> i1 +// CHECK: return %[[VAL_8]] : i1 // CHECK: } diff --git a/test/AST-Quake/ternary.cpp b/test/AST-Quake/ternary.cpp index 74c437d2392..375b9645d12 100644 --- a/test/AST-Quake/ternary.cpp +++ b/test/AST-Quake/ternary.cpp @@ -36,68 +36,15 @@ int main() { } // CHECK-LABEL: func.func 
@__nvqpp__mlirgen__function_test_kernel._Z11test_kerneli( -// CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : i32 -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : i32 -// CHECK: %[[VAL_5:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_0]], %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_7:.*]] = cc.cast signed %[[VAL_6]] : (i32) -> i64 -// CHECK: %[[VAL_8:.*]] = quake.alloca !quake.veq{{\[}}%[[VAL_7]] : i64] -// CHECK: %[[VAL_9:.*]] = quake.extract_ref %[[VAL_8]][0] : (!quake.veq) -> !quake.ref -// CHECK: quake.h %[[VAL_9]] : (!quake.ref) -> () -// CHECK: cc.scope { -// CHECK: %[[VAL_10:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_4]], %[[VAL_10]] : !cc.ptr -// CHECK: cc.loop while { -// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.load %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_13:.*]] = arith.subi %[[VAL_12]], %[[VAL_3]] : i32 -// CHECK: %[[VAL_14:.*]] = arith.cmpi slt, %[[VAL_11]], %[[VAL_13]] : i32 -// CHECK: cc.condition %[[VAL_14]] -// CHECK: } do { -// CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_10]] : !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.cast signed %[[VAL_15]] : (i32) -> i64 -// CHECK: %[[VAL_17:.*]] = quake.extract_ref %[[VAL_8]]{{\[}}%[[VAL_16]]] : (!quake.veq, i64) -> !quake.ref -// CHECK: %[[VAL_18:.*]] = cc.load %[[VAL_10]] : !cc.ptr -// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_3]] : i32 -// CHECK: %[[VAL_20:.*]] = cc.cast signed %[[VAL_19]] : (i32) -> i64 -// CHECK: %[[VAL_21:.*]] = quake.extract_ref %[[VAL_8]]{{\[}}%[[VAL_20]]] : (!quake.veq, i64) -> !quake.ref -// CHECK: quake.x {{\[}}%[[VAL_17]]] %[[VAL_21]] : (!quake.ref, !quake.ref) -> () -// CHECK: cc.continue -// CHECK: } step { -// CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_10]] : !cc.ptr -// 
CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_3]] : i32 -// CHECK: cc.store %[[VAL_23]], %[[VAL_10]] : !cc.ptr +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : i32 +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : i32 +// CHECK: cc.scope { +// CHECK: %[[VAL_35:.*]] = cc.alloca i1 +// CHECK: %[[VAL_36:.*]] = cc.load %[[VAL_35]] : !cc.ptr +// CHECK: %[[VAL_37:.*]] = cc.if(%[[VAL_36]]) -> i32 { +// CHECK: cc.continue %[[VAL_3]] : i32 +// CHECK: } else { +// CHECK: cc.continue %[[VAL_4]] : i32 +// CHECK: } // CHECK: } -// CHECK: } -// CHECK: %[[VAL_24:.*]] = quake.mz %[[VAL_8]] name "results" : (!quake.veq) -> !quake.measurements -// CHECK: %[[VAL_25:.*]] = cc.alloca i32 -// CHECK: cc.store %[[VAL_4]], %[[VAL_25]] : !cc.ptr -// CHECK: %[[VAL_26:.*]] = quake.veq_size %[[VAL_8]] : (!quake.veq) -> i64 -// CHECK: %[[VAL_27:.*]] = cc.loop while ((%[[VAL_28:.*]] = %[[VAL_2]]) -> (i64)) { -// CHECK: %[[VAL_29:.*]] = arith.cmpi slt, %[[VAL_28]], %[[VAL_26]] : i64 -// CHECK: cc.condition %[[VAL_29]](%[[VAL_28]] : i64) -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_30:.*]]: i64): -// CHECK: %[[VAL_31:.*]] = quake.get_measure %[[VAL_24]]{{\[}}%[[VAL_30]]] : (!quake.measurements, i64) -> !quake.measure -// CHECK: %[[VAL_32:.*]] = quake.discriminate %[[VAL_31]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_33:.*]] = cc.if(%[[VAL_32]]) -> i32 { -// CHECK: cc.continue %[[VAL_3]] : i32 -// CHECK: } else { -// CHECK: cc.continue %[[VAL_4]] : i32 -// CHECK: } -// CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_25]] : !cc.ptr -// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_34]], %[[VAL_33]] : i32 -// CHECK: cc.store %[[VAL_35]], %[[VAL_25]] : !cc.ptr -// CHECK: cc.continue %[[VAL_30]] : i64 -// CHECK: } step { -// CHECK: ^bb0(%[[VAL_36:.*]]: i64): -// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_36]], %[[VAL_1]] : i64 -// CHECK: cc.continue %[[VAL_37]] : i64 -// CHECK: } {invariant} -// CHECK: %[[VAL_38:.*]] = cc.load %[[VAL_25]] : !cc.ptr -// CHECK: return %[[VAL_38]] : i32 -// CHECK: } +// CHECK: 
cc.continue %{{.*}} : i64 diff --git a/test/AST-Quake/to_integer.cpp b/test/AST-Quake/to_integer.cpp index 14dbddb4db8..af4b217c45e 100644 --- a/test/AST-Quake/to_integer.cpp +++ b/test/AST-Quake/to_integer.cpp @@ -23,17 +23,7 @@ struct kernel { } }; -struct kernel_via_bool_vector { - void operator()() __qpu__ { - cudaq::qvector q(4); - int64_t results_int = - cudaq::to_integer(cudaq::to_bool_vector(mz(q))); - external_call_to_keep_result(results_int); - } -}; - // clang-format off // CHECK-LABEL: define void @__nvqpp__mlirgen__kernel() // CHECK-NOT: llvm.vector -// CHECK-LABEL: define void @__nvqpp__mlirgen__kernel_via_bool_vector() -// CHECK-NOT: llvm.vector + diff --git a/test/AST-Quake/to_qir.cpp b/test/AST-Quake/to_qir.cpp index 7248a7c6229..e09c4998bec 100644 --- a/test/AST-Quake/to_qir.cpp +++ b/test/AST-Quake/to_qir.cpp @@ -33,33 +33,34 @@ struct kernel { // clang-format off // CHECK-LABEL: define void @__nvqpp__mlirgen__kernel() -// CHECK: %[[VAL_0:.*]] = tail call %[[VAL_1:.*]]* @__quantum__rt__qubit_allocate_array(i64 3) -// CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 1) -// CHECK: %[[VAL_4:.*]] = load %[[VAL_3]]*, %[[VAL_3]]** %[[VAL_2]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_3]]* %[[VAL_4]]) -// CHECK: %[[VAL_5:.*]] = tail call %[[VAL_3]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 2) -// CHECK: %[[VAL_6:.*]] = load %[[VAL_3]]*, %[[VAL_3]]** %[[VAL_5]], align 8 -// CHECK: tail call void (i64, i64, i64, i64, i8*, ...) 
@generalizedInvokeWithRotationsControlsTargets(i64 0, i64 0, i64 1, i64 1, i8* nonnull bitcast (void (%[[VAL_1]]*, %[[VAL_3]]*)* @__quantum__qis__x__ctl to i8*), %[[VAL_3]]* %[[VAL_4]], %[[VAL_3]]* %[[VAL_6]]) -// CHECK: %[[VAL_7:.*]] = tail call %[[VAL_3]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_1]]* %[[VAL_0]], i64 0) -// CHECK: %[[VAL_8:.*]] = load %[[VAL_3]]*, %[[VAL_3]]** %[[VAL_7]], align 8 -// CHECK: tail call void (i64, i64, i64, i64, i8*, ...) @generalizedInvokeWithRotationsControlsTargets(i64 0, i64 0, i64 1, i64 1, i8* nonnull bitcast (void (%[[VAL_1]]*, %[[VAL_3]]*)* @__quantum__qis__x__ctl to i8*), %[[VAL_3]]* %[[VAL_8]], %[[VAL_3]]* %[[VAL_4]]) -// CHECK: tail call void @__quantum__qis__h(%[[VAL_3]]* %[[VAL_8]]) -// CHECK: %[[VAL_9:.*]] = tail call %[[VAL_10:.*]]* @__quantum__qis__mz__to__register(%[[VAL_3]]* %[[VAL_8]], i8* nonnull getelementptr inbounds ([3 x i8], [3 x i8]* @cstr.623000, i64 0, i64 0)) -// CHECK: %[[VAL_11:.*]] = tail call %[[VAL_10]]* @__quantum__qis__mz__to__register(%[[VAL_3]]* %[[VAL_4]], i8* nonnull getelementptr inbounds ([3 x i8], [3 x i8]* @cstr.623100, i64 0, i64 0)) -// CHECK: %[[VAL_12:.*]] = bitcast %[[VAL_10]]* %[[VAL_11]] to i1* -// CHECK: %[[VAL_13:.*]] = load i1, i1* %[[VAL_12]], align 1 -// CHECK: br i1 %[[VAL_13]], label %[[VAL_14:.*]], label %[[VAL_15:.*]] -// CHECK: {{[0-9]+}}: -// CHECK: tail call void @__quantum__qis__x(%[[VAL_3]]* %[[VAL_6]]) -// CHECK: br label %[[VAL_15]] -// CHECK: {{[0-9]+}}: -// CHECK: %[[VAL_16:.*]] = bitcast %[[VAL_10]]* %[[VAL_9]] to i1* -// CHECK: %[[VAL_17:.*]] = load i1, i1* %[[VAL_16]], align 1 -// CHECK: br i1 %[[VAL_17]], label %[[VAL_18:.*]], label %[[VAL_19:.*]] -// CHECK: {{[0-9]+}}: -// CHECK: tail call void @__quantum__qis__z(%[[VAL_3]]* %[[VAL_6]]) -// CHECK: br label %[[VAL_19]] -// CHECK: {{[0-9]+}}: -// CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_1]]* %[[VAL_0]]) +// CHECK: %[[VAL_0:.*]] = tail call %Array* 
@__quantum__rt__qubit_allocate_array(i64 3) +// CHECK: %[[VAL_2:.*]] = tail call %Qubit** @__quantum__rt__array_get_element_ptr_1d(%Array* %[[VAL_0]], i64 1) +// CHECK: %[[VAL_4:.*]] = load %Qubit*, %Qubit** %[[VAL_2]], align 8 +// CHECK: tail call void @__quantum__qis__h(%Qubit* %[[VAL_4]]) +// CHECK: %[[VAL_5:.*]] = tail call %Qubit** @__quantum__rt__array_get_element_ptr_1d(%Array* %[[VAL_0]], i64 2) +// CHECK: %[[VAL_6:.*]] = load %Qubit*, %Qubit** %[[VAL_5]], align 8 +// CHECK: tail call void (i64, i64, i64, i64, i8*, ...) @generalizedInvokeWithRotationsControlsTargets(i64 0, i64 0, i64 1, i64 1, i8* nonnull bitcast (void (%Array*, %Qubit*)* @__quantum__qis__x__ctl to i8*), %Qubit* %[[VAL_4]], %Qubit* %[[VAL_6]]) +// CHECK: %[[VAL_7:.*]] = tail call %Qubit** @__quantum__rt__array_get_element_ptr_1d(%Array* %[[VAL_0]], i64 0) +// CHECK: %[[VAL_8:.*]] = load %Qubit*, %Qubit** %[[VAL_7]], align 8 +// CHECK: tail call void (i64, i64, i64, i64, i8*, ...) @generalizedInvokeWithRotationsControlsTargets(i64 0, i64 0, i64 1, i64 1, i8* nonnull bitcast (void (%Array*, %Qubit*)* @__quantum__qis__x__ctl to i8*), %Qubit* %[[VAL_8]], %Qubit* %[[VAL_4]]) +// CHECK: tail call void @__quantum__qis__h(%Qubit* %[[VAL_8]]) +// CHECK: %[[VAL_9:.*]] = tail call %Result* @__quantum__qis__mz__to__register(%Qubit* %[[VAL_8]], i8* nonnull getelementptr inbounds ([3 x i8], [3 x i8]* @cstr.623000, i64 0, i64 0)) +// CHECK: %[[VAL_11:.*]] = bitcast %Result* %[[VAL_9]] to i1* +// CHECK: %[[VAL_12:.*]] = load i1, i1* %[[VAL_11]], align 1 +// CHECK: %[[VAL_13:.*]] = tail call %Result* @__quantum__qis__mz__to__register(%Qubit* %[[VAL_4]], i8* nonnull getelementptr inbounds ([3 x i8], [3 x i8]* @cstr.623100, i64 0, i64 0)) +// CHECK: %[[VAL_14:.*]] = bitcast %Result* %[[VAL_13]] to i1* +// CHECK: %[[VAL_15:.*]] = load i1, i1* %[[VAL_14]], align 1 +// CHECK: br i1 %[[VAL_15]], label %[[VAL_16:.*]], label %[[VAL_17:.*]] +// CHECK: 14: ; preds = %[[VAL_18:.*]] +// CHECK: tail call void 
@__quantum__qis__x(%Qubit* %[[VAL_6]]) +// CHECK: br label %[[VAL_17]] +// CHECK: 15: ; preds = %[[VAL_16]], %[[VAL_18]] +// CHECK: br i1 %[[VAL_12]], label %[[VAL_19:.*]], label %[[VAL_20:.*]] +// CHECK: 16: ; preds = %[[VAL_17]] +// CHECK: tail call void @__quantum__qis__z(%Qubit* %[[VAL_6]]) +// CHECK: br label %[[VAL_20]] +// CHECK: 17: ; preds = %[[VAL_19]], %[[VAL_17]] +// CHECK: tail call void @__quantum__rt__qubit_release_array(%Array* %[[VAL_0]]) // CHECK: ret void // CHECK: } + diff --git a/test/AST-Quake/tuple-0.cpp b/test/AST-Quake/tuple-0.cpp index 8b32b0ada52..1480c42f677 100644 --- a/test/AST-Quake/tuple-0.cpp +++ b/test/AST-Quake/tuple-0.cpp @@ -23,7 +23,7 @@ struct ArithmeticTupleQernel { // CHECK: %[[VAL_1:.*]] = cc.alloca !cc.struct<{[[TUP]]}{{.*}}> // CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr> // CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<1> -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on @@ -41,7 +41,7 @@ struct ArithmeticPairQernel { // CHECK: %[[VAL_1:.*]] = cc.alloca !cc.struct<{f32, i32} [64,4]> // CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr> // CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<1> -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on @@ -79,7 +79,7 @@ struct ArithmeticTupleQernelWithUse { // CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_1]] : i64 // CHECK: cc.continue %[[VAL_15]] : i64 // CHECK: } {invariant} -// CHECK: %[[VAL_16:.*]] = quake.mz %[[VAL_7]] : (!quake.veq) -> !quake.measurements +// CHECK: %[[VAL_16:.*]] = quake.mz %[[VAL_7]] : (!quake.veq) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on @@ -117,7 +117,7 @@ struct ArithmeticTupleQernelWithUse0 { 
// CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_1]] : i64 // CHECK: cc.continue %[[VAL_15]] : i64 // CHECK: } {invariant} -// CHECK: %[[VAL_16:.*]] = quake.mz %[[VAL_7]] : (!quake.veq) -> !quake.measurements +// CHECK: %[[VAL_16:.*]] = quake.mz %[[VAL_7]] : (!quake.veq) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on @@ -138,7 +138,7 @@ struct ArithmeticPairQernelWithUse { // CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr // CHECK: %[[VAL_4:.*]] = cc.cast signed %[[VAL_3]] : (i32) -> i64 // CHECK: %[[VAL_5:.*]] = quake.alloca !quake.veq[%[[VAL_4]] : i64] -// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] : (!quake.veq) -> !quake.measurements +// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] : (!quake.veq) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on diff --git a/test/AST-Quake/vector-0.cpp b/test/AST-Quake/vector-0.cpp index 2fb78bd51ef..93ea14a6949 100644 --- a/test/AST-Quake/vector-0.cpp +++ b/test/AST-Quake/vector-0.cpp @@ -45,7 +45,7 @@ struct simple_double_rotation { // CHECK: %[[VAL_12:.*]] = cc.load %[[VAL_11]] : !cc.ptr // CHECK: %[[VAL_13:.*]] = quake.extract_ref %[[VAL_8]][0] : (!quake.veq<1>) -> !quake.ref // CHECK: quake.rx (%[[VAL_12]]) %[[VAL_13]] : (f64, !quake.ref) -> () -// CHECK: %[[VAL_14:.*]] = quake.mz %[[VAL_8]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_14:.*]] = quake.mz %[[VAL_8]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on @@ -79,7 +79,7 @@ struct simple_float_rotation { // CHECK: %[[VAL_12:.*]] = math.absf %[[VAL_11]] : f32 // CHECK: %[[VAL_13:.*]] = quake.extract_ref %[[VAL_8]][0] : (!quake.veq<1>) -> !quake.ref // CHECK: quake.rx (%[[VAL_12]]) %[[VAL_13]] : (f32, !quake.ref) -> () -// CHECK: %[[VAL_14:.*]] = quake.mz %[[VAL_8]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_14:.*]] = quake.mz %[[VAL_8]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on @@ -105,7 +105,7 @@ struct difficult_symphony { 
// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr // CHECK: %[[VAL_7:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<1>) -> !quake.ref // CHECK: quake.rx (%[[VAL_6]]) %[[VAL_7]] : (f32, !quake.ref) -> () -// CHECK: %[[VAL_8:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_8:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: return // CHECK: } // clang-format on diff --git a/test/AST-Quake/vector_bool.cpp b/test/AST-Quake/vector_bool.cpp index cf9a0459799..1a9e504308c 100644 --- a/test/AST-Quake/vector_bool.cpp +++ b/test/AST-Quake/vector_bool.cpp @@ -22,13 +22,15 @@ struct t1 { // clang-format off // CHECK-LABEL: func.func @__nvqpp__mlirgen__t1( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.stdvec) -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} { -// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] name "vec" : (!quake.veq<2>) -> !quake.measurements<2> -// CHECK: %[[VAL_4:.*]] = quake.get_measure %[[VAL_3]]{{\[}}%[[VAL_1]]] : (!quake.measurements<2>, i64) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = quake.discriminate %[[VAL_4]] : (!quake.measure) -> i1 -// CHECK: return %[[VAL_5]] : i1 +// CHECK-SAME: %[[VAL_0:.*]]: !cc.stdvec{{.*}}) -> i1 attributes +// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.veq<2> +// CHECK: %[[VAL_12:.*]] = quake.mz %[[VAL_1]] name "vec" : (!quake.veq<2>) -> !cc.stdvec +// CHECK: %[[VAL_2:.*]] = quake.discriminate %[[VAL_12]] : +// CHECK: %[[VAL_3:.*]] = cc.stdvec_data %[[VAL_2]] : (!cc.stdvec) -> !cc.ptr> +// CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.load %[[VAL_4]] : !cc.ptr +// CHECK: %[[VAL_6:.*]] = cc.cast %[[VAL_5]] : (i8) -> i1 +// CHECK: return %[[VAL_6]] : i1 // CHECK: } // CHECK-NOT: func.func private @_ZNKSt14_Bit_referencecvbEv() -> i1 // clang-format on @@ -36,18 +38,16 @@ struct t1 { struct VectorBoolReturn { 
std::vector operator()() __qpu__ { cudaq::qvector q(4); - auto res = mz(q); - return cudaq::to_bool_vector(res); + return mz(q); } }; // clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__VectorBoolReturn() -> !cc.stdvec -// CHECK-SAME: attributes {"cudaq-entrypoint" +// CHECK-LABEL: func.func @__nvqpp__mlirgen__VectorBoolReturn() -> !cc.stdvec attributes {"cudaq-entrypoint", "cudaq-kernel"} { // CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 // CHECK: %[[VAL_1:.*]] = quake.alloca !quake.veq<4> -// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "res" : (!quake.veq<4>) -> !quake.measurements<4> -// CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measurements<4>) -> !cc.stdvec +// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_1]] : (!quake.veq<4>) -> !cc.stdvec +// CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!cc.stdvec) -> !cc.stdvec // CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec) -> !cc.ptr // CHECK: %[[VAL_5:.*]] = cc.stdvec_size %[[VAL_3]] : (!cc.stdvec) -> i64 // CHECK: %[[VAL_6:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_4]], %[[VAL_5]], %[[VAL_0]]) : (!cc.ptr, i64, i64) -> !cc.ptr @@ -56,20 +56,24 @@ struct VectorBoolReturn { // CHECK: } // clang-format on -struct VectorMeasureResult { - std::vector operator()() __qpu__ { +struct VectorBoolResult { + std::vector operator()() __qpu__ { cudaq::qvector q(4); - return mz(q); + std::vector vec = mz(q); + return vec; } }; // clang-format off -// CHECK-LABEL: func.func @__nvqpp__mlirgen__VectorMeasureResult() -> !quake.measurements -// CHECK-NOT: cudaq-entrypoint -// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<4> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !quake.measurements<4> -// CHECK: %[[VAL_2:.*]] = quake.relax_size %[[VAL_1]] : (!quake.measurements<4>) -> !quake.measurements -// CHECK-NOT: quake.discriminate -// CHECK: return %[[VAL_2]] : !quake.measurements +// CHECK-LABEL: func.func @__nvqpp__mlirgen__VectorBoolResult() -> !cc.stdvec 
attributes {"cudaq-entrypoint", "cudaq-kernel"} { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.veq<4> +// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "vec" : (!quake.veq<4>) -> !cc.stdvec +// CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!cc.stdvec) -> !cc.stdvec +// CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec) -> !cc.ptr +// CHECK: %[[VAL_5:.*]] = cc.stdvec_size %[[VAL_3]] : (!cc.stdvec) -> i64 +// CHECK: %[[VAL_6:.*]] = call @__nvqpp_vectorCopyCtor(%[[VAL_4]], %[[VAL_5]], %[[VAL_0]]) : (!cc.ptr, i64, i64) -> !cc.ptr +// CHECK: %[[VAL_7:.*]] = cc.stdvec_init %[[VAL_6]], %[[VAL_5]] : (!cc.ptr, i64) -> !cc.stdvec +// CHECK: return %[[VAL_7]] : !cc.stdvec // CHECK: } // clang-format on diff --git a/test/AST-Quake/veq_size_init_state.cpp b/test/AST-Quake/veq_size_init_state.cpp index 112b3c1d21a..acf59bef7cb 100644 --- a/test/AST-Quake/veq_size_init_state.cpp +++ b/test/AST-Quake/veq_size_init_state.cpp @@ -54,6 +54,6 @@ struct kernel { // CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_2]] : i64 // CHECK: cc.continue %[[VAL_22]] : i64 // CHECK: } {invariant} -// CHECK: %[[VAL_23:.*]] = quake.mz %[[VAL_14]] : (!quake.veq) -> !quake.measurements +// CHECK: %[[VAL_23:.*]] = quake.mz %[[VAL_14]] : (!quake.veq) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/AST-error/run_struct_of_vec.cpp b/test/AST-error/run_struct_of_vec.cpp index a79d3beb481..74e814f5613 100644 --- a/test/AST-error/run_struct_of_vec.cpp +++ b/test/AST-error/run_struct_of_vec.cpp @@ -23,12 +23,12 @@ struct Foo { struct Quark { Foo operator()() __qpu__ { // expected-error{{kernel result type not supported}} cudaq::qvector q(3); - return {747, cudaq::to_bool_vector(mz(q))}; + return {747, mz(q)}; } }; int main() { - auto const result1 = cudaq::run(10, vec_of_vec); + auto const result1 = cudaq::run(10, vec_of_vec); auto const result2 = cudaq::run(10, Quark{}); return 0; } diff --git 
a/test/Transforms/add_measurements-0.qke b/test/Transforms/add_measurements-0.qke index 2f61e3ab649..0430bfa1b40 100644 --- a/test/Transforms/add_measurements-0.qke +++ b/test/Transforms/add_measurements-0.qke @@ -57,7 +57,7 @@ func.func @__nvqpp__mlirgen__bell_pair_no_mz() attributes {"cudaq-entrypoint", " // CHECK: quake.x {{\[}}%[[VAL_1]]] %[[VAL_2]] : (!quake.ref, !quake.ref) -> () // CHECK: cf.br ^bb1 // CHECK: ^bb1: -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/Transforms/add_measurements-1.qke b/test/Transforms/add_measurements-1.qke index 4c4301728ea..6214cc576af 100644 --- a/test/Transforms/add_measurements-1.qke +++ b/test/Transforms/add_measurements-1.qke @@ -96,7 +96,7 @@ module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__caller = "__nvqpp // CHECK: } {invariant} // CHECK: cf.br ^bb1 // CHECK: ^bb1: -// CHECK: %[[VAL_20:.*]] = quake.mz %[[VAL_4]] : (!quake.veq<3>) -> !quake.measurements<3> -// CHECK: %[[VAL_21:.*]] = quake.mz %[[VAL_12]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_20:.*]] = quake.mz %[[VAL_4]] : (!quake.veq<3>) -> !cc.stdvec +// CHECK: %[[VAL_21:.*]] = quake.mz %[[VAL_12]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/Transforms/combine_measurements.qke b/test/Transforms/combine_measurements.qke index d773e130a35..b6cb2db4a82 100644 --- a/test/Transforms/combine_measurements.qke +++ b/test/Transforms/combine_measurements.qke @@ -20,7 +20,7 @@ func.func @mz_2bits_extract_cst_index() attributes {"cudaq-entrypoint"} { // CHECK-LABEL: func.func @mz_2bits_extract_cst_index() attributes {"cudaq-entrypoint", output_names = "{{\[\[\[0,\[0,\\220\\22\]\],\[1,\[1,\\221\\22\]\]\]\]}}"} { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> 
!quake.measurements<2> +// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -36,7 +36,7 @@ func.func @mz_2bits_extract_cst_index() attributes {"cudaq-entrypoint"} { } // CHECK-LABEL: func.func @mz_2bits_extract_cst_op_index() attributes {"cudaq-entrypoint", output_names = "{{\[\[\[0,\[0,\\220\\22\]\],\[1,\[1,\\221\\22\]\]\]\]}}"} { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -51,7 +51,7 @@ func.func @mz_2bits_extract_cst_index() attributes {"cudaq-entrypoint"} { // CHECK-LABEL: func.func @mz_2bits_extract_non_consecutive() attributes {"cudaq-entrypoint", output_names = "{{\[\[\[0,\[0,\\220\\22\]\],\[1,\[2,\\222\\22\]\]\]\]}}"} { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<3> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<3>) -> !quake.measurements<3> +// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<3>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -59,13 +59,13 @@ func.func @mz_2bits_extract_cst_index() attributes {"cudaq-entrypoint"} { %c1_i64 = arith.constant 1 : i64 %0 = quake.alloca !quake.veq<4> %1 = quake.subveq %0, %c1_i64, %c1_i64 : (!quake.veq<4>, i64, i64) -> !quake.veq<1> - %measOut = quake.mz %1 : (!quake.veq<1>) -> !quake.measurements<1> + %measOut = quake.mz %1 : (!quake.veq<1>) -> !cc.stdvec return } // CHECK-LABEL: func.func @subveq_4_1() attributes {"cudaq-entrypoint", output_names = "{{\[\[\[0,\[1,\\221\\22\]\]\]\]}}"} { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<4> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !quake.measurements<4> +// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -74,13 +74,13 @@ func.func @mz_2bits_extract_cst_index() attributes 
{"cudaq-entrypoint"} { %c2_i64 = arith.constant 2 : i64 %0 = quake.alloca !quake.veq<4> %1 = quake.subveq %0, %c1_i64, %c2_i64 : (!quake.veq<4>, i64, i64) -> !quake.veq<2> - %measOut = quake.mz %1 : (!quake.veq<2>) -> !quake.measurements<2> + %measOut = quake.mz %1 : (!quake.veq<2>) -> !cc.stdvec return } // CHECK-LABEL: func.func @subveq_4_2() attributes {"cudaq-entrypoint", output_names = "{{\[\[\[0,\[1,\\221\\22\]\],\[1,\[2,\\222\\22\]\]\]\]}}"} { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<4> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !quake.measurements<4> +// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -89,13 +89,13 @@ func.func @mz_2bits_extract_cst_index() attributes {"cudaq-entrypoint"} { %c3_i64 = arith.constant 3 : i64 %0 = quake.alloca !quake.veq<4> %1 = quake.subveq %0, %c0_i64, %c3_i64 : (!quake.veq<4>, i64, i64) -> !quake.veq<4> - %measOut = quake.mz %1 : (!quake.veq<4>) -> !quake.measurements<4> + %measOut = quake.mz %1 : (!quake.veq<4>) -> !cc.stdvec return } // CHECK-LABEL: func.func @subveq_4_4() attributes {"cudaq-entrypoint", output_names = "{{\[\[\[0,\[0,\\220\\22\]\],\[1,\[1,\\221\\22\]\],\[2,\[2,\\222\\22\]\],\[3,\[3,\\223\\22\]\]\]\]}}"} { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<4> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !quake.measurements<4> +// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -104,15 +104,15 @@ func.func @mz_2bits_extract_cst_index() attributes {"cudaq-entrypoint"} { %1 = quake.extract_ref %0[1] : (!quake.veq<4>) -> !quake.ref %2 = quake.extract_ref %0[2] : (!quake.veq<4>) -> !quake.ref %3 = quake.subveq %0, 0, 1 : (!quake.veq<4>) -> !quake.veq<2> - %measOut = quake.mz %3 : (!quake.veq<2>) -> !quake.measurements<2> + %measOut = quake.mz %3 : (!quake.veq<2>) -> !cc.stdvec %4 = quake.subveq %0, 2, 2 : (!quake.veq<4>) -> !quake.veq<1> - 
%measOut_0 = quake.mz %4 : (!quake.veq<1>) -> !quake.measurements<1> + %measOut_0 = quake.mz %4 : (!quake.veq<1>) -> !cc.stdvec return } // CHECK-LABEL: func.func @mz_2subveqs_extract() attributes {"cudaq-entrypoint", output_names = "{{\[\[\[0,\[0,\\220\\22\]\],\[1,\[1,\\221\\22\]\],\[2,\[2,\\222\\22\]\]\]\]}}"} { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<4> -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !quake.measurements<4> +// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<4>) -> !cc.stdvec // CHECK: return // CHECK: } } diff --git a/test/Transforms/convert_to_qir_measurements.qke b/test/Transforms/convert_to_qir_measurements.qke deleted file mode 100644 index 97a97b595cd..00000000000 --- a/test/Transforms/convert_to_qir_measurements.qke +++ /dev/null @@ -1,180 +0,0 @@ -// ========================================================================== // -// Copyright (c) 2026 NVIDIA Corporation & Affiliates. // -// All rights reserved. // -// // -// This source code and the accompanying materials are made available under // -// the terms of the Apache License 2.0 which accompanies this distribution. 
// -// ========================================================================== // - -// RUN: cudaq-opt --convert-to-qir-api %s | FileCheck %s - -func.func @get_measure_lowering(%ms : !quake.measurements) -> !quake.measure attributes {"cudaq-kernel"} { - %m = quake.get_measure %ms[0] : (!quake.measurements) -> !quake.measure - return %m : !quake.measure -} - -// CHECK-LABEL: func.func @get_measure_lowering( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> !cc.ptr> attributes {"cudaq-kernel", "qir-api"} { -// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_2:.*]] = call @__quantum__rt__result_array_get_element_ptr_1d(%[[VAL_0]], %[[VAL_1]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_2]] : !cc.ptr>> -// CHECK: return %[[VAL_3]] : !cc.ptr> -// CHECK: } - - -func.func @get_measure_dynamic_index(%ms : !quake.measurements, %idx : index) -> !quake.measure attributes {"cudaq-kernel"} { - %m = quake.get_measure %ms[%idx] : (!quake.measurements, index) -> !quake.measure - return %m : !quake.measure -} - -// CHECK-LABEL: func.func @get_measure_dynamic_index( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: index) -> !cc.ptr> attributes {"cudaq-kernel", "qir-api"} { -// CHECK: %[[VAL_2:.*]] = arith.index_cast %[[VAL_1]] : index to i64 -// CHECK: %[[VAL_3:.*]] = call @__quantum__rt__result_array_get_element_ptr_1d(%[[VAL_0]], %[[VAL_2]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr>> -// CHECK: return %[[VAL_4]] : !cc.ptr> -// CHECK: } - - -func.func @discriminate_i1(%m : !quake.measure) -> i1 attributes {"cudaq-kernel"} { - %bit = quake.discriminate %m : (!quake.measure) -> i1 - return %bit : i1 -} - -// CHECK-LABEL: func.func @discriminate_i1( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> i1 attributes {"cudaq-kernel", "qir-api"} { -// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr -// CHECK: 
return %[[VAL_2]] : i1 -// CHECK: } - - -func.func @discriminate_i4(%m : !quake.measure) -> i4 attributes {"cudaq-kernel"} { - %bit = quake.discriminate %m : (!quake.measure) -> i4 - return %bit : i4 -} - -// CHECK-LABEL: func.func @discriminate_i4( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> i4 attributes {"cudaq-kernel", "qir-api"} { -// CHECK: %[[VAL_1:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_1]] : !cc.ptr -// CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (i8) -> i4 -// CHECK: return %[[VAL_3]] : i4 -// CHECK: } - - -func.func private @callee(%ms : !quake.measurements) -> i1 - -func.func @caller(%ms : !quake.measurements) -> i1 attributes {"cudaq-kernel"} { - %r = call @callee(%ms) : (!quake.measurements) -> i1 - return %r : i1 -} - -// CHECK: func.func private @callee(!cc.ptr>) -> i1 attributes {"qir-api"} - -// CHECK-LABEL: func.func @caller( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> i1 attributes {"cudaq-kernel", "qir-api"} { -// CHECK: %[[VAL_1:.*]] = call @callee(%[[VAL_0]]) : (!cc.ptr>) -> i1 -// CHECK: return %[[VAL_1]] : i1 -// CHECK: } - - -func.func @discriminate_unsized(%ms : !quake.measurements) -> !cc.stdvec attributes {"cudaq-kernel"} { - %bits = quake.discriminate %ms : (!quake.measurements) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @discriminate_unsized( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> !cc.stdvec attributes {"cudaq-kernel", "qir-api"} { -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_3:.*]] = call @__quantum__rt__array_get_size_1d(%[[VAL_0]]) : (!cc.ptr>) -> i64 -// CHECK: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] -// CHECK: %[[VAL_5:.*]] = cc.loop while ((%[[VAL_6:.*]] = %[[VAL_2]]) -> (i64)) { -// CHECK: %[[VAL_7:.*]] = arith.cmpi slt, %[[VAL_6]], %[[VAL_3]] : i64 -// CHECK: cc.condition %[[VAL_7]](%[[VAL_6]] : i64) -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_8:.*]]: 
i64): -// CHECK: %[[VAL_9:.*]] = func.call @__quantum__rt__result_array_get_element_ptr_1d(%[[VAL_0]], %[[VAL_8]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_10:.*]] = cc.load %[[VAL_9]] : !cc.ptr>> -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.load %[[VAL_11]] : !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_8]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.cast unsigned %[[VAL_12]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_14]], %[[VAL_13]] : !cc.ptr -// CHECK: cc.continue %[[VAL_8]] : i64 -// CHECK: } step { -// CHECK: ^bb0(%[[VAL_15:.*]]: i64): -// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_15]], %[[VAL_1]] : i64 -// CHECK: cc.continue %[[VAL_16]] : i64 -// CHECK: } {invariant} -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_18:.*]] = cc.stdvec_init %[[VAL_17]], %[[VAL_3]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_18]] : !cc.stdvec -// CHECK: } - -func.func @get_measure_i32_index(%ms : !quake.measurements, %idx : i32) -> !quake.measure attributes {"cudaq-kernel"} { - %m = quake.get_measure %ms[%idx] : (!quake.measurements, i32) -> !quake.measure - return %m : !quake.measure -} - -// CHECK-LABEL: func.func @get_measure_i32_index( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !cc.ptr> attributes {"cudaq-kernel", "qir-api"} { -// CHECK: %[[VAL_2:.*]] = cc.cast unsigned %[[VAL_1]] : (i32) -> i64 -// CHECK: %[[VAL_3:.*]] = call @__quantum__rt__result_array_get_element_ptr_1d(%[[VAL_0]], %[[VAL_2]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr>> -// CHECK: return %[[VAL_4]] : !cc.ptr> -// CHECK: } - -func.func @multi_qubit_mz_ref_and_veq() attributes {"cudaq-kernel", "cudaq-entrypoint"} { - %q = quake.alloca !quake.ref - %qs = quake.alloca !quake.veq<2> - %meas = quake.mz %q, %qs name "reg" : (!quake.ref, !quake.veq<2>) -> 
!quake.measurements<3> - return -} - -// CHECK-LABEL: func.func @multi_qubit_mz_ref_and_veq() attributes {"cudaq-entrypoint", "cudaq-kernel", "qir-api"} { -// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 2 : i64 -// CHECK: %[[VAL_3:.*]] = call @__quantum__rt__qubit_allocate() : () -> !cc.ptr> -// CHECK: %[[VAL_4:.*]] = call @__quantum__rt__qubit_allocate_array(%[[VAL_2]]) : (i64) -> !cc.ptr> -// CHECK: %[[VAL_5:.*]] = call @__quantum__rt__array_get_size_1d(%[[VAL_4]]) : (!cc.ptr>) -> i64 -// CHECK: %[[VAL_6:.*]] = arith.addi %[[VAL_5]], %[[VAL_0]] : i64 -// CHECK: %[[VAL_7:.*]] = call @__quantum__rt__result_array_create_1d(%[[VAL_6]]) : (i64) -> !cc.ptr> -// CHECK: %[[VAL_8:.*]] = cc.address_of @cstr.{{.*}} : !cc.ptr> -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_10:.*]] = call @__quantum__qis__mz__to__register(%[[VAL_3]], %[[VAL_9]]) : (!cc.ptr>, !cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_11:.*]] = call @__quantum__rt__result_array_get_element_ptr_1d(%[[VAL_7]], %[[VAL_1]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_10]], %[[VAL_11]] : !cc.ptr>> -// CHECK: %[[VAL_12:.*]] = cc.loop while ((%[[VAL_13:.*]] = %[[VAL_1]]) -> (i64)) { -// CHECK: %[[VAL_14:.*]] = arith.cmpi slt, %[[VAL_13]], %[[VAL_5]] : i64 -// CHECK: cc.condition %[[VAL_14]](%[[VAL_13]] : i64) -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_15:.*]]: i64): -// CHECK: %[[VAL_16:.*]] = func.call @__quantum__rt__array_get_element_ptr_1d(%[[VAL_4]], %[[VAL_15]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_17:.*]] = cc.load %[[VAL_16]] : !cc.ptr>> -// CHECK: %[[VAL_18:.*]] = func.call @__quantum__qis__mz__to__register(%[[VAL_17]], %[[VAL_9]]) : (!cc.ptr>, !cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_15]], %[[VAL_0]] : i64 -// CHECK: %[[VAL_20:.*]] = func.call @__quantum__rt__result_array_get_element_ptr_1d(%[[VAL_7]], %[[VAL_19]]) : 
(!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: cc.store %[[VAL_18]], %[[VAL_20]] : !cc.ptr>> -// CHECK: cc.continue %[[VAL_15]] : i64 -// CHECK: } step { -// CHECK: ^bb0(%[[VAL_21:.*]]: i64): -// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_0]] : i64 -// CHECK: cc.continue %[[VAL_22]] : i64 -// CHECK: } {invariant} -// CHECK: return -// CHECK: } - -func.func @relax_size_measurements(%ms : !quake.measurements<4>) -> !quake.measurements attributes {"cudaq-kernel"} { - %relaxed = quake.relax_size %ms : (!quake.measurements<4>) -> !quake.measurements - return %relaxed : !quake.measurements -} - -// CHECK-LABEL: func.func @relax_size_measurements( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> !cc.ptr> attributes {"cudaq-kernel", "qir-api"} { -// CHECK: return %[[VAL_0]] : !cc.ptr> -// CHECK: } diff --git a/test/Transforms/cse.qke b/test/Transforms/cse.qke index bd1de41a9c9..d095cf815e9 100644 --- a/test/Transforms/cse.qke +++ b/test/Transforms/cse.qke @@ -63,13 +63,13 @@ func.func private @device_kernel(!quake.veq) func.func @canonicalize_concat() { %q1 = quake.alloca !quake.ref %q2 = quake.concat %q1 : (!quake.ref) -> !quake.veq<1> - %b1 = quake.mz %q2 : (!quake.veq<1>) -> !quake.measurements<1> + %b1 = quake.mz %q2 : (!quake.veq<1>) -> !cc.stdvec %q3 = quake.alloca !quake.veq<1> %q4 = quake.concat %q3 : (!quake.veq<1>) -> !quake.veq<1> - %b2 = quake.mz %q4 : (!quake.veq<1>) -> !quake.measurements<1> + %b2 = quake.mz %q4 : (!quake.veq<1>) -> !cc.stdvec %q5 = quake.alloca !quake.veq<1> %q6 = quake.concat %q5 : (!quake.veq<1>) -> !quake.veq - %b3 = quake.mz %q6 : (!quake.veq) -> !quake.measurements + %b3 = quake.mz %q6 : (!quake.veq) -> !cc.stdvec %q7 = quake.alloca !quake.veq<2> %q8 = quake.concat %q7 : (!quake.veq<2>) -> !quake.veq call @device_kernel(%q8) : (!quake.veq) -> () @@ -83,11 +83,11 @@ func.func @canonicalize_concat() { // CHECK-LABEL: func.func @canonicalize_concat() { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref // CHECK: %[[VAL_1:.*]] = quake.concat 
%[[VAL_0]] : (!quake.ref) -> !quake.veq<1> -// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_1]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_1]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: %[[VAL_3:.*]] = quake.alloca !quake.veq<1> -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: %[[VAL_5:.*]] = quake.alloca !quake.veq<1> -// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] : (!quake.veq<1>) -> !quake.measurements<1> +// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] : (!quake.veq<1>) -> !cc.stdvec // CHECK: %[[VAL_7:.*]] = quake.alloca !quake.veq<2> // CHECK: %[[VAL_8:.*]] = quake.relax_size %[[VAL_7]] : (!quake.veq<2>) -> !quake.veq // CHECK: call @device_kernel(%[[VAL_8]]) : (!quake.veq) -> () @@ -101,13 +101,13 @@ func.func @canonicalize_multiple_concat() { %q1 = quake.alloca !quake.ref %p1 = quake.alloca !quake.ref %q2 = quake.concat %q1, %p1 : (!quake.ref, !quake.ref) -> !quake.veq<2> - %b1 = quake.mz %q2 : (!quake.veq<2>) -> !quake.measurements<2> + %b1 = quake.mz %q2 : (!quake.veq<2>) -> !cc.stdvec %q3 = quake.alloca !quake.veq<1> %q4 = quake.concat %q1, %p1, %q3 : (!quake.ref, !quake.ref, !quake.veq<1>) -> !quake.veq<3> - %b2 = quake.mz %q4 : (!quake.veq<3>) -> !quake.measurements<3> + %b2 = quake.mz %q4 : (!quake.veq<3>) -> !cc.stdvec %q5 = quake.alloca !quake.veq<1> %q6 = quake.concat %q3, %q5 : (!quake.veq<1>, !quake.veq<1>) -> !quake.veq - %b3 = quake.mz %q6 : (!quake.veq) -> !quake.measurements + %b3 = quake.mz %q6 : (!quake.veq) -> !cc.stdvec %q7 = quake.alloca !quake.veq<2> %q8 = quake.concat %q3, %q7 : (!quake.veq<1>, !quake.veq<2>) -> !quake.veq call @device_kernel(%q8) : (!quake.veq) -> () @@ -122,13 +122,13 @@ func.func @canonicalize_multiple_concat() { // CHECK: %[[VAL_0:.*]] = quake.alloca !quake.ref // CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref // CHECK: %[[VAL_2:.*]] = quake.concat 
%[[VAL_0]], %[[VAL_1]] : (!quake.ref, !quake.ref) -> !quake.veq<2> -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<1> // CHECK: %[[VAL_5:.*]] = quake.concat %[[VAL_0]], %[[VAL_1]], %[[VAL_4]] : (!quake.ref, !quake.ref, !quake.veq<1>) -> !quake.veq<3> -// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] : (!quake.veq<3>) -> !quake.measurements<3> +// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] : (!quake.veq<3>) -> !cc.stdvec // CHECK: %[[VAL_7:.*]] = quake.alloca !quake.veq<1> // CHECK: %[[VAL_8:.*]] = quake.concat %[[VAL_4]], %[[VAL_7]] : (!quake.veq<1>, !quake.veq<1>) -> !quake.veq<2> -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: %[[VAL_10:.*]] = quake.alloca !quake.veq<2> // CHECK: %[[VAL_11:.*]] = quake.concat %[[VAL_4]], %[[VAL_10]] : (!quake.veq<1>, !quake.veq<2>) -> !quake.veq<3> // CHECK: %[[VAL_12:.*]] = quake.relax_size %[[VAL_11]] : (!quake.veq<3>) -> !quake.veq diff --git a/test/Transforms/expand_and_qir_measurements.qke b/test/Transforms/expand_and_qir_measurements.qke deleted file mode 100644 index 4cd4ac78de2..00000000000 --- a/test/Transforms/expand_and_qir_measurements.qke +++ /dev/null @@ -1,89 +0,0 @@ -// ========================================================================== // -// Copyright (c) 2026 NVIDIA Corporation & Affiliates. // -// All rights reserved. // -// // -// This source code and the accompanying materials are made available under // -// the terms of the Apache License 2.0 which accompanies this distribution. 
// -// ========================================================================== // - -// RUN: cudaq-opt --expand-measurements --convert-to-qir-api %s | FileCheck %s - -func.func @converter_func(%ms : !quake.measurements<2>) -> !cc.stdvec attributes {"cudaq-kernel"} { - %bits = quake.discriminate %ms : (!quake.measurements<2>) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @converter_func( -// CHECK-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> !cc.stdvec attributes {"cudaq-kernel", "qir-api"} { -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64 -// CHECK: %[[VAL_4:.*]] = cc.alloca !cc.array -// CHECK: %[[VAL_5:.*]] = call @__quantum__rt__result_array_get_element_ptr_1d(%[[VAL_0]], %[[VAL_2]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr>> -// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.load %[[VAL_7]] : !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_10:.*]] = cc.cast unsigned %[[VAL_8]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_10]], %[[VAL_9]] : !cc.ptr -// CHECK: %[[VAL_11:.*]] = call @__quantum__rt__result_array_get_element_ptr_1d(%[[VAL_0]], %[[VAL_1]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_12:.*]] = cc.load %[[VAL_11]] : !cc.ptr>> -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_12]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.load %[[VAL_13]] : !cc.ptr -// CHECK: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.cast unsigned %[[VAL_14]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_16]], %[[VAL_15]] : !cc.ptr -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_18:.*]] = cc.stdvec_init %[[VAL_17]], %[[VAL_3]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_18]] : !cc.stdvec -// CHECK: } - - -func.func 
@combination_targets() -> !cc.stdvec attributes {"cudaq-kernel", "cudaq-entrypoint"} { - %q = quake.alloca !quake.ref - %qs = quake.alloca !quake.veq<2> - %meas = quake.mz %q, %qs name "mixed" : (!quake.ref, !quake.veq<2>) -> !quake.measurements<3> - %bits = quake.discriminate %meas : (!quake.measurements<3>) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @combination_targets() -> !cc.stdvec attributes {"cudaq-entrypoint", "cudaq-kernel", "qir-api"} { -// CHECK: %[[VAL_0:.*]] = arith.constant 2 : i64 -// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_3:.*]] = arith.constant 3 : i64 -// CHECK: %[[VAL_4:.*]] = call @__quantum__rt__qubit_allocate() : () -> !cc.ptr> -// CHECK: %[[VAL_5:.*]] = call @__quantum__rt__qubit_allocate_array(%[[VAL_0]]) : (i64) -> !cc.ptr> -// CHECK: %[[VAL_6:.*]] = cc.address_of @cstr.6D6978656400 : !cc.ptr> -// CHECK: %[[VAL_7:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_8:.*]] = call @__quantum__qis__mz__to__register(%[[VAL_4]], %[[VAL_7]]) {registerName = "mixed"} : (!cc.ptr>, !cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_9:.*]] = call @__quantum__rt__array_get_element_ptr_1d(%[[VAL_5]], %[[VAL_1]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_10:.*]] = cc.load %[[VAL_9]] : !cc.ptr>> -// CHECK: %[[VAL_11:.*]] = cc.address_of @cstr.6D6978656400 : !cc.ptr> -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = call @__quantum__qis__mz__to__register(%[[VAL_10]], %[[VAL_12]]) {registerName = "mixed"} : (!cc.ptr>, !cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_14:.*]] = call @__quantum__rt__array_get_element_ptr_1d(%[[VAL_5]], %[[VAL_2]]) : (!cc.ptr>, i64) -> !cc.ptr>> -// CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_14]] : !cc.ptr>> -// CHECK: %[[VAL_16:.*]] = cc.address_of @cstr.6D6978656400 : !cc.ptr> -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_18:.*]] = 
call @__quantum__qis__mz__to__register(%[[VAL_15]], %[[VAL_17]]) {registerName = "mixed"} : (!cc.ptr>, !cc.ptr) -> !cc.ptr> -// CHECK: %[[VAL_19:.*]] = cc.alloca !cc.array -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_20]] : !cc.ptr -// CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_19]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_23:.*]] = cc.cast unsigned %[[VAL_21]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_23]], %[[VAL_22]] : !cc.ptr -// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_13]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_25:.*]] = cc.load %[[VAL_24]] : !cc.ptr -// CHECK: %[[VAL_26:.*]] = cc.compute_ptr %[[VAL_19]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_27:.*]] = cc.cast unsigned %[[VAL_25]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_27]], %[[VAL_26]] : !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.cast %[[VAL_18]] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_29:.*]] = cc.load %[[VAL_28]] : !cc.ptr -// CHECK: %[[VAL_30:.*]] = cc.compute_ptr %[[VAL_19]][2] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_31:.*]] = cc.cast unsigned %[[VAL_29]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr -// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_19]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_33:.*]] = cc.stdvec_init %[[VAL_32]], %[[VAL_3]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_33]] : !cc.stdvec -// CHECK: } diff --git a/test/Transforms/expand_measurements.qke b/test/Transforms/expand_measurements.qke deleted file mode 100644 index 35edfbaf8b0..00000000000 --- a/test/Transforms/expand_measurements.qke +++ /dev/null @@ -1,451 +0,0 @@ -// ========================================================================== // -// Copyright (c) 2026 NVIDIA Corporation & Affiliates. // -// All rights reserved. // -// // -// This source code and the accompanying materials are made available under // -// the terms of the Apache License 2.0 which accompanies this distribution. 
// -// ========================================================================== // - -// RUN: cudaq-opt --expand-measurements %s | FileCheck %s - -func.func @converter_sized(%ms : !quake.measurements<3>) -> !cc.stdvec { - %bits = quake.discriminate %ms : (!quake.measurements<3>) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @converter_sized( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.measurements<3>) -> !cc.stdvec { -// CHECK: %[[VAL_1:.*]] = arith.constant 3 : i64 -// CHECK: %[[VAL_2:.*]] = cc.alloca i8{{\[}}%[[VAL_1]] : i64] -// CHECK: %[[VAL_3:.*]] = quake.get_measure %[[VAL_0]][0] : (!quake.measurements<3>) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_7:.*]] = quake.get_measure %[[VAL_0]][1] : (!quake.measurements<3>) -> !quake.measure -// CHECK: %[[VAL_8:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_10:.*]] = cc.cast unsigned %[[VAL_8]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_10]], %[[VAL_9]] : !cc.ptr -// CHECK: %[[VAL_11:.*]] = quake.get_measure %[[VAL_0]][2] : (!quake.measurements<3>) -> !quake.measure -// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_11]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_2]][2] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.cast unsigned %[[VAL_12]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_14]], %[[VAL_13]] : !cc.ptr -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_16:.*]] = cc.stdvec_init %[[VAL_15]], %[[VAL_1]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_16]] : !cc.stdvec -// CHECK: } - -func.func @converter_single(%m : 
!quake.measure) -> i1 { - %bit = quake.discriminate %m : (!quake.measure) -> i1 - return %bit : i1 -} - -// CHECK-LABEL: func.func @converter_single( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.measure) -> i1 { -// CHECK: %[[VAL_1:.*]] = quake.discriminate %[[VAL_0]] : (!quake.measure) -> i1 -// CHECK: return %[[VAL_1]] : i1 -// CHECK: } - -func.func @converter_sized_i4(%ms : !quake.measurements<2>) -> !cc.stdvec { - %bits = quake.discriminate %ms : (!quake.measurements<2>) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @converter_sized_i4( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.measurements<2>) -> !cc.stdvec { -// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i64 -// CHECK: %[[VAL_2:.*]] = cc.alloca i8{{\[}}%[[VAL_1]] : i64] -// CHECK: %[[VAL_3:.*]] = quake.get_measure %[[VAL_0]][0] : (!quake.measurements<2>) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i4 -// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i4) -> i8 -// CHECK: cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr -// CHECK: %[[VAL_7:.*]] = quake.get_measure %[[VAL_0]][1] : (!quake.measurements<2>) -> !quake.measure -// CHECK: %[[VAL_8:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i4 -// CHECK: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_10:.*]] = cc.cast unsigned %[[VAL_8]] : (i4) -> i8 -// CHECK: cc.store %[[VAL_10]], %[[VAL_9]] : !cc.ptr -// CHECK: %[[VAL_11:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_12:.*]] = cc.stdvec_init %[[VAL_11]], %[[VAL_1]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_12]] : !cc.stdvec -// CHECK: } - -func.func @expand_mz_veq_i3() -> !cc.stdvec { - %0 = quake.alloca !quake.veq<2> - %measOut = quake.mz %0 : (!quake.veq<2>) -> !quake.measurements<2> - %bits = quake.discriminate %measOut : (!quake.measurements<2>) -> !cc.stdvec - return 
%bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @expand_mz_veq_i3() -> !cc.stdvec { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_2:.*]] = arith.constant 2 : i64 -// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_4:.*]] = quake.extract_ref %[[VAL_3]]{{\[}}%[[VAL_0]]] : (!quake.veq<2>, i64) -> !quake.ref -// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_4]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_6:.*]] = quake.extract_ref %[[VAL_3]]{{\[}}%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref -// CHECK: %[[VAL_7:.*]] = quake.mz %[[VAL_6]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_2]] : i64] -// CHECK: %[[VAL_9:.*]] = quake.discriminate %[[VAL_5]] : (!quake.measure) -> i3 -// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_8]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_11:.*]] = cc.cast unsigned %[[VAL_9]] : (i3) -> i8 -// CHECK: cc.store %[[VAL_11]], %[[VAL_10]] : !cc.ptr -// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i3 -// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.cast unsigned %[[VAL_12]] : (i3) -> i8 -// CHECK: cc.store %[[VAL_14]], %[[VAL_13]] : !cc.ptr -// CHECK: %[[VAL_15:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_16:.*]] = cc.stdvec_init %[[VAL_15]], %[[VAL_2]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_16]] : !cc.stdvec -// CHECK: } - -func.func @expand_mz_ref_i1() -> !cc.stdvec { - %0 = quake.alloca !quake.ref - %1 = quake.alloca !quake.ref - %m = quake.mz %0, %1 : (!quake.ref, !quake.ref) -> !quake.measurements<2> - %bits = quake.discriminate %m : (!quake.measurements<2>) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @expand_mz_ref_i1() -> !cc.stdvec { -// CHECK: %[[VAL_0:.*]] = arith.constant 2 : i64 -// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref -// 
CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_1]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_2]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = cc.alloca i8{{\[}}%[[VAL_0]] : i64] -// CHECK: %[[VAL_6:.*]] = quake.discriminate %[[VAL_3]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.cast unsigned %[[VAL_6]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_8]], %[[VAL_7]] : !cc.ptr -// CHECK: %[[VAL_9:.*]] = quake.discriminate %[[VAL_4]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_10:.*]] = cc.compute_ptr %[[VAL_5]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_11:.*]] = cc.cast unsigned %[[VAL_9]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_11]], %[[VAL_10]] : !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_13:.*]] = cc.stdvec_init %[[VAL_12]], %[[VAL_0]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_13]] : !cc.stdvec -// CHECK: } - - -func.func @callee(%q0 : !quake.ref, %q1 : !quake.ref) -> !quake.measurements<2> { - %m = quake.mz %q0, %q1 : (!quake.ref, !quake.ref) -> !quake.measurements<2> - return %m : !quake.measurements<2> -} - -func.func @caller() -> !cc.stdvec { - %q0 = quake.alloca !quake.ref - %q1 = quake.alloca !quake.ref - %ms = call @callee(%q0, %q1) : (!quake.ref, !quake.ref) -> !quake.measurements<2> - %bits = quake.discriminate %ms : (!quake.measurements<2>) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @callee( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.ref, -// CHECK-SAME: %[[VAL_1:.*]]: !quake.ref) -> !quake.measurements<2> { -// CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_0]], %[[VAL_1]] : (!quake.ref, !quake.ref) -> !quake.measurements<2> -// CHECK: return %[[VAL_2]] : !quake.measurements<2> -// CHECK: } - -// CHECK-LABEL: func.func @caller() -> !cc.stdvec { -// CHECK: %[[VAL_0:.*]] = arith.constant 2 : i64 
-// CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_3:.*]] = call @callee(%[[VAL_1]], %[[VAL_2]]) : (!quake.ref, !quake.ref) -> !quake.measurements<2> -// CHECK: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_0]] : i64] -// CHECK: %[[VAL_5:.*]] = quake.get_measure %[[VAL_3]][0] : (!quake.measurements<2>) -> !quake.measure -// CHECK: %[[VAL_6:.*]] = quake.discriminate %[[VAL_5]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_4]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.cast unsigned %[[VAL_6]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_8]], %[[VAL_7]] : !cc.ptr -// CHECK: %[[VAL_9:.*]] = quake.get_measure %[[VAL_3]][1] : (!quake.measurements<2>) -> !quake.measure -// CHECK: %[[VAL_10:.*]] = quake.discriminate %[[VAL_9]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_12:.*]] = cc.cast unsigned %[[VAL_10]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_12]], %[[VAL_11]] : !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_14:.*]] = cc.stdvec_init %[[VAL_13]], %[[VAL_0]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_14]] : !cc.stdvec -// CHECK: } - -func.func @expand_mz_mixed_ref_veq() -> !cc.stdvec { - %0 = quake.alloca !quake.ref - %1 = quake.alloca !quake.veq<2> - %m = quake.mz %0, %1 : (!quake.ref, !quake.veq<2>) -> !quake.measurements<3> - %bits = quake.discriminate %m : (!quake.measurements<3>) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @expand_mz_mixed_ref_veq() -> !cc.stdvec { -// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 3 : i64 -// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_5:.*]] = quake.mz %[[VAL_3]] : (!quake.ref) -> 
!quake.measure -// CHECK: %[[VAL_6:.*]] = quake.extract_ref %[[VAL_4]]{{\[}}%[[VAL_0]]] : (!quake.veq<2>, i64) -> !quake.ref -// CHECK: %[[VAL_7:.*]] = quake.mz %[[VAL_6]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_8:.*]] = quake.extract_ref %[[VAL_4]]{{\[}}%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_10:.*]] = cc.alloca i8{{\[}}%[[VAL_2]] : i64] -// CHECK: %[[VAL_11:.*]] = quake.discriminate %[[VAL_5]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_13:.*]] = cc.cast unsigned %[[VAL_11]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_13]], %[[VAL_12]] : !cc.ptr -// CHECK: %[[VAL_14:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_15:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_16:.*]] = cc.cast unsigned %[[VAL_14]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_16]], %[[VAL_15]] : !cc.ptr -// CHECK: %[[VAL_17:.*]] = quake.discriminate %[[VAL_9]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_18:.*]] = cc.compute_ptr %[[VAL_10]][2] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_19:.*]] = cc.cast unsigned %[[VAL_17]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_19]], %[[VAL_18]] : !cc.ptr -// CHECK: %[[VAL_20:.*]] = cc.cast %[[VAL_10]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_21:.*]] = cc.stdvec_init %[[VAL_20]], %[[VAL_2]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_21]] : !cc.stdvec -// CHECK: } - - -func.func @expand_mz_multi_veq() -> !cc.stdvec { - %0 = quake.alloca !quake.veq<2> - %1 = quake.alloca !quake.veq<3> - %m = quake.mz %0, %1 : (!quake.veq<2>, !quake.veq<3>) -> !quake.measurements<5> - %bits = quake.discriminate %m : (!quake.measurements<5>) -> !cc.stdvec - return %bits : !cc.stdvec -} -// CHECK-LABEL: func.func @expand_mz_multi_veq() -> !cc.stdvec { -// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0 : i64 -// CHECK-DAG: 
%[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 2 : i64 -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 5 : i64 -// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_5:.*]] = quake.alloca !quake.veq<3> -// CHECK: %[[VAL_6:.*]] = quake.extract_ref %[[VAL_4]]{{\[}}%[[VAL_0]]] : (!quake.veq<2>, i64) -> !quake.ref -// CHECK: %[[VAL_7:.*]] = quake.mz %[[VAL_6]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_8:.*]] = quake.extract_ref %[[VAL_4]]{{\[}}%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_10:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_0]]] : (!quake.veq<3>, i64) -> !quake.ref -// CHECK: %[[VAL_11:.*]] = quake.mz %[[VAL_10]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_12:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_1]]] : (!quake.veq<3>, i64) -> !quake.ref -// CHECK: %[[VAL_13:.*]] = quake.mz %[[VAL_12]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_14:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_2]]] : (!quake.veq<3>, i64) -> !quake.ref -// CHECK: %[[VAL_15:.*]] = quake.mz %[[VAL_14]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_16:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] -// CHECK: %[[VAL_17:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_18:.*]] = cc.compute_ptr %[[VAL_16]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_19:.*]] = cc.cast unsigned %[[VAL_17]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_19]], %[[VAL_18]] : !cc.ptr -// CHECK: %[[VAL_20:.*]] = quake.discriminate %[[VAL_9]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_16]][1] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_22:.*]] = cc.cast unsigned %[[VAL_20]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_22]], %[[VAL_21]] : !cc.ptr -// CHECK: %[[VAL_23:.*]] = quake.discriminate %[[VAL_11]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_24:.*]] = cc.compute_ptr %[[VAL_16]][2] 
: (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_25:.*]] = cc.cast unsigned %[[VAL_23]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_25]], %[[VAL_24]] : !cc.ptr -// CHECK: %[[VAL_26:.*]] = quake.discriminate %[[VAL_13]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_27:.*]] = cc.compute_ptr %[[VAL_16]][3] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_28:.*]] = cc.cast unsigned %[[VAL_26]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_28]], %[[VAL_27]] : !cc.ptr -// CHECK: %[[VAL_29:.*]] = quake.discriminate %[[VAL_15]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_30:.*]] = cc.compute_ptr %[[VAL_16]][4] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_31:.*]] = cc.cast unsigned %[[VAL_29]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_31]], %[[VAL_30]] : !cc.ptr -// CHECK: %[[VAL_32:.*]] = cc.cast %[[VAL_16]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_33:.*]] = cc.stdvec_init %[[VAL_32]], %[[VAL_3]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_33]] : !cc.stdvec -// CHECK: } - - -func.func @converter_unsized(%ms : !quake.measurements) -> !cc.stdvec { - %bits = quake.discriminate %ms : (!quake.measurements) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @converter_unsized( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.measurements) -> !cc.stdvec { -// CHECK: %[[VAL_1:.*]] = quake.discriminate %[[VAL_0]] : (!quake.measurements) -> !cc.stdvec -// CHECK: return %[[VAL_1]] : !cc.stdvec -// CHECK: } - - -func.func @veq_to_measurements(%veq : !quake.veq) -> !quake.measurements { - %m = quake.mz %veq : (!quake.veq) -> !quake.measurements - return %m : !quake.measurements -} - -// CHECK-LABEL: func.func @veq_to_measurements( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.veq) -> !quake.measurements { -// CHECK: %[[VAL_1:.*]] = quake.mz %[[VAL_0]] : (!quake.veq) -> !quake.measurements -// CHECK: return %[[VAL_1]] : !quake.measurements -// CHECK: } - -func.func @dynamic_get_measure_used(%idx: i64, %ptr: !cc.ptr) { - %veq = quake.alloca !quake.veq<3> - %m = quake.mz %veq : (!quake.veq<3>) -> 
!quake.measurements<3> - %gm = quake.get_measure %m[%idx] : (!quake.measurements<3>, i64) -> !quake.measure - %bit = quake.discriminate %gm : (!quake.measure) -> i1 - cc.store %bit, %ptr : !cc.ptr - quake.dealloc %veq : !quake.veq<3> - return -} - -// CHECK-LABEL: func.func @dynamic_get_measure_used( -// CHECK-SAME: %[[VAL_0:.*]]: i64, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr) { -// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<3> -// CHECK: %[[VAL_3:.*]] = quake.mz %[[VAL_2]] : (!quake.veq<3>) -> !quake.measurements<3> -// CHECK: %[[VAL_4:.*]] = quake.get_measure %[[VAL_3]]{{\[}}%[[VAL_0]]] : (!quake.measurements<3>, i64) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = quake.discriminate %[[VAL_4]] : (!quake.measure) -> i1 -// CHECK: cc.store %[[VAL_5]], %[[VAL_1]] : !cc.ptr -// CHECK: quake.dealloc %[[VAL_2]] : !quake.veq<3> -// CHECK: return -// CHECK: } - -func.func @expand_mz_unsized_no_disc(%v : !quake.veq) { - %m = quake.mz %v : (!quake.veq) -> !quake.measurements - return -} - -// CHECK-LABEL: func.func @expand_mz_unsized_no_disc( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.veq) { -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_3:.*]] = quake.veq_size %[[VAL_0]] : (!quake.veq) -> i64 -// CHECK: %[[VAL_4:.*]] = cc.loop while ((%[[VAL_5:.*]] = %[[VAL_1]]) -> (i64)) { -// CHECK: %[[VAL_6:.*]] = arith.cmpi slt, %[[VAL_5]], %[[VAL_3]] : i64 -// CHECK: cc.condition %[[VAL_6]](%[[VAL_5]] : i64) -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_7:.*]]: i64): -// CHECK: %[[VAL_8:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_7]]] : (!quake.veq, i64) -> !quake.ref -// CHECK: %[[VAL_9:.*]] = quake.mz %[[VAL_8]] : (!quake.ref) -> !quake.measure -// CHECK: cc.continue %[[VAL_7]] : i64 -// CHECK: } step { -// CHECK: ^bb0(%[[VAL_10:.*]]: i64): -// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_2]] : i64 -// CHECK: cc.continue %[[VAL_11]] : i64 -// CHECK: } {invariant} -// CHECK: return -// CHECK: } - 
-func.func @expand_mz_sized_no_users() { - %0 = quake.alloca !quake.veq<2> - %m = quake.mz %0 : (!quake.veq<2>) -> !quake.measurements<2> - return -} - -// CHECK-LABEL: func.func @expand_mz_sized_no_users() { -// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<2> -// CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_2]]{{\[}}%[[VAL_0]]] : (!quake.veq<2>, i64) -> !quake.ref -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_3]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = quake.extract_ref %[[VAL_2]]{{\[}}%[[VAL_1]]] : (!quake.veq<2>, i64) -> !quake.ref -// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_5]] : (!quake.ref) -> !quake.measure -// CHECK: return -// CHECK: } - -func.func @expand_mz_unsized_ref_veq(%r : !quake.ref, %v : !quake.veq) -> !cc.stdvec { - %m = quake.mz %r, %v : (!quake.ref, !quake.veq) -> !quake.measurements - %bits = quake.discriminate %m : (!quake.measurements) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @expand_mz_unsized_ref_veq( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.ref, -// CHECK-SAME: %[[VAL_1:.*]]: !quake.veq) -> !cc.stdvec { -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : i64 -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64 -// CHECK: %[[VAL_4:.*]] = quake.veq_size %[[VAL_1]] : (!quake.veq) -> i64 -// CHECK: %[[VAL_5:.*]] = arith.addi %[[VAL_4]], %[[VAL_2]] : i64 -// CHECK: %[[VAL_6:.*]] = cc.alloca i8{{\[}}%[[VAL_5]] : i64] -// CHECK: %[[VAL_7:.*]] = quake.mz %[[VAL_0]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_8:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_9:.*]] = cc.compute_ptr %[[VAL_6]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_10:.*]] = cc.cast unsigned %[[VAL_8]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_10]], %[[VAL_9]] : !cc.ptr -// CHECK: %[[VAL_11:.*]] = quake.veq_size %[[VAL_1]] : (!quake.veq) -> i64 -// CHECK: %[[VAL_12:.*]] = cc.loop 
while ((%[[VAL_13:.*]] = %[[VAL_3]]) -> (i64)) { -// CHECK: %[[VAL_14:.*]] = arith.cmpi slt, %[[VAL_13]], %[[VAL_11]] : i64 -// CHECK: cc.condition %[[VAL_14]](%[[VAL_13]] : i64) -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_15:.*]]: i64): -// CHECK: %[[VAL_16:.*]] = quake.extract_ref %[[VAL_1]]{{\[}}%[[VAL_15]]] : (!quake.veq, i64) -> !quake.ref -// CHECK: %[[VAL_17:.*]] = quake.mz %[[VAL_16]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_18:.*]] = quake.discriminate %[[VAL_17]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_15]], %[[VAL_2]] : i64 -// CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_6]]{{\[}}%[[VAL_19]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_21:.*]] = cc.cast unsigned %[[VAL_18]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_21]], %[[VAL_20]] : !cc.ptr -// CHECK: cc.continue %[[VAL_15]] : i64 -// CHECK: } step { -// CHECK: ^bb0(%[[VAL_22:.*]]: i64): -// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_2]] : i64 -// CHECK: cc.continue %[[VAL_23]] : i64 -// CHECK: } {invariant} -// CHECK: %[[VAL_24:.*]] = cc.cast %[[VAL_6]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_25:.*]] = cc.stdvec_init %[[VAL_24]], %[[VAL_5]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_25]] : !cc.stdvec -// CHECK: } - -func.func @expand_mx_unsized_veq(%v : !quake.veq) -> !cc.stdvec { - %m = quake.mx %v : (!quake.veq) -> !quake.measurements - %bits = quake.discriminate %m : (!quake.measurements) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @expand_mx_unsized_veq( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.veq) -> !cc.stdvec { -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_3:.*]] = quake.veq_size %[[VAL_0]] : (!quake.veq) -> i64 -// CHECK: %[[VAL_4:.*]] = cc.alloca i8{{\[}}%[[VAL_3]] : i64] -// CHECK: %[[VAL_5:.*]] = quake.veq_size %[[VAL_0]] : (!quake.veq) -> i64 -// CHECK: %[[VAL_6:.*]] = cc.loop while ((%[[VAL_7:.*]] = %[[VAL_1]]) -> 
(i64)) { -// CHECK: %[[VAL_8:.*]] = arith.cmpi slt, %[[VAL_7]], %[[VAL_5]] : i64 -// CHECK: cc.condition %[[VAL_8]](%[[VAL_7]] : i64) -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_9:.*]]: i64): -// CHECK: %[[VAL_10:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_9]]] : (!quake.veq, i64) -> !quake.ref -// CHECK: %[[VAL_11:.*]] = quake.mx %[[VAL_10]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_12:.*]] = quake.discriminate %[[VAL_11]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_4]]{{\[}}%[[VAL_9]]] : (!cc.ptr>, i64) -> !cc.ptr -// CHECK: %[[VAL_14:.*]] = cc.cast unsigned %[[VAL_12]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_14]], %[[VAL_13]] : !cc.ptr -// CHECK: cc.continue %[[VAL_9]] : i64 -// CHECK: } step { -// CHECK: ^bb0(%[[VAL_15:.*]]: i64): -// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_15]], %[[VAL_2]] : i64 -// CHECK: cc.continue %[[VAL_16]] : i64 -// CHECK: } {invariant} -// CHECK: %[[VAL_17:.*]] = cc.cast %[[VAL_4]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_18:.*]] = cc.stdvec_init %[[VAL_17]], %[[VAL_3]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_18]] : !cc.stdvec -// CHECK: } - -func.func @expand_my_veq() -> !cc.stdvec { - %0 = quake.alloca !quake.veq<1> - %measOut = quake.my %0 : (!quake.veq<1>) -> !quake.measurements<1> - %bits = quake.discriminate %measOut : (!quake.measurements<1>) -> !cc.stdvec - return %bits : !cc.stdvec -} - -// CHECK-LABEL: func.func @expand_my_veq() -> !cc.stdvec { -// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0 : i64 -// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<1> -// CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_2]]{{\[}}%[[VAL_0]]] : (!quake.veq<1>, i64) -> !quake.ref -// CHECK: %[[VAL_4:.*]] = quake.my %[[VAL_3]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = cc.alloca i8{{\[}}%[[VAL_1]] : i64] -// CHECK: %[[VAL_6:.*]] = quake.discriminate %[[VAL_4]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_7:.*]] = 
cc.compute_ptr %[[VAL_5]][0] : (!cc.ptr>) -> !cc.ptr -// CHECK: %[[VAL_8:.*]] = cc.cast unsigned %[[VAL_6]] : (i1) -> i8 -// CHECK: cc.store %[[VAL_8]], %[[VAL_7]] : !cc.ptr -// CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_5]] : (!cc.ptr>) -> !cc.ptr> -// CHECK: %[[VAL_10:.*]] = cc.stdvec_init %[[VAL_9]], %[[VAL_1]] : (!cc.ptr>, i64) -> !cc.stdvec -// CHECK: return %[[VAL_10]] : !cc.stdvec -// CHECK: } diff --git a/test/Transforms/invalid.qke b/test/Transforms/invalid.qke index 381e2863ec9..70267203f83 100644 --- a/test/Transforms/invalid.qke +++ b/test/Transforms/invalid.qke @@ -226,36 +226,3 @@ func.func private @wonk(!quake.veq<18>, i32) -> f64 // expected-error@+1 {{cannot classically allocate quake abstract type}} %0 = cc.alloca !quake.measure - -// ----- - -func.func @relax_size_measurements_sized_result(%q : !quake.veq<4>) { - %ms = quake.mz %q : (!quake.veq<4>) -> !quake.measurements<4> - // expected-error@+1 {{result measurements type must not specify a size}} - %bad = quake.relax_size %ms : (!quake.measurements<4>) -> !quake.measurements<4> - return -} - -// ----- - -func.func @relax_size_type_mismatch(%q : !quake.veq<4>) { - // expected-error@+1 {{input and result must both be measurements types}} - %bad = quake.relax_size %q : (!quake.veq<4>) -> !quake.measurements - return -} - -// ----- - -func.func @relax_size_veq_sized_result(%q : !quake.veq<4>) { - // expected-error@+1 {{result veq type must not specify a size}} - %bad = quake.relax_size %q : (!quake.veq<4>) -> !quake.veq<4> - return -} - -// ----- - -func.func @relax_size_veq_type_mismatch(%ms : !quake.measurements<4>) { - // expected-error@+1 {{input and result must both be veq types}} - %bad = quake.relax_size %ms : (!quake.measurements<4>) -> !quake.veq - return -} diff --git a/test/Transforms/kernel_exec-1.qke b/test/Transforms/kernel_exec-1.qke index 035cdd64aab..24e20ab60b3 100644 --- a/test/Transforms/kernel_exec-1.qke +++ b/test/Transforms/kernel_exec-1.qke @@ -13,7 +13,7 @@ module attributes 
{quake.mangled_name_map = { __nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} { - func.func @__nvqpp__mlirgen__ghz(%arg0: i32) -> f64 attributes {"cudaq-entrypoint"} { + func.func @__nvqpp__mlirgen__ghz(%arg0: i32) -> f64 { %0 = cc.alloca i32 cc.store %arg0, %0 : !cc.ptr %1 = cc.load %0 : !cc.ptr @@ -446,3 +446,4 @@ module attributes {quake.mangled_name_map = { // HYBRID: llvm.return // HYBRID: } // HYBRID: llvm.mlir.global_ctors {ctors = [@ghz.kernelRegFunc], priorities = [17 : i32]} + diff --git a/test/Transforms/loop.qke b/test/Transforms/loop.qke index d209777fd8c..57b67eaad52 100644 --- a/test/Transforms/loop.qke +++ b/test/Transforms/loop.qke @@ -412,7 +412,7 @@ func.func @empty_step() { } step { } } - %2 = quake.mz %1 : (!quake.veq) -> !quake.measurements + %2 = quake.mz %1 : (!quake.veq) -> !cc.stdvec return } diff --git a/test/Transforms/mapping_non_unitaries.qke b/test/Transforms/mapping_non_unitaries.qke index d008ee613a8..353e8599016 100644 --- a/test/Transforms/mapping_non_unitaries.qke +++ b/test/Transforms/mapping_non_unitaries.qke @@ -30,7 +30,7 @@ func.func @test_measurement() { %3:2 = quake.x [%1] %0 : (!quake.wire, !quake.wire) -> (!quake.wire, !quake.wire) %4:2 = quake.x [%3#0] %2 : (!quake.wire, !quake.wire) -> (!quake.wire, !quake.wire) %5:2 = quake.x [%4#1] %3#1 : (!quake.wire, !quake.wire) -> (!quake.wire, !quake.wire) - %bits, %wires:3 = quake.mz %5#1, %4#0, %5#0 name "result": (!quake.wire, !quake.wire, !quake.wire) -> (!quake.measurements<3>, !quake.wire, !quake.wire, !quake.wire) + %bits, %wires:3 = quake.mz %5#1, %4#0, %5#0 name "result": (!quake.wire, !quake.wire, !quake.wire) -> (!cc.stdvec, !quake.wire, !quake.wire, !quake.wire) quake.return_wire %wires#0 : !quake.wire quake.return_wire %wires#1 : !quake.wire quake.return_wire %wires#2 : !quake.wire @@ -45,7 +45,7 @@ func.func @test_measurement() { // CHECK: %[[VAL_4:.*]]:2 = quake.x {{\[}}%[[VAL_3]]#0] %[[VAL_2]] : (!quake.wire, !quake.wire) -> (!quake.wire, !quake.wire) // CHECK: 
%[[VAL_5:.*]]:2 = quake.swap %[[VAL_4]]#1, %[[VAL_4]]#0 : (!quake.wire, !quake.wire) -> (!quake.wire, !quake.wire) // CHECK: %[[VAL_6:.*]]:2 = quake.x {{\[}}%[[VAL_5]]#1] %[[VAL_3]]#1 : (!quake.wire, !quake.wire) -> (!quake.wire, !quake.wire) -// CHECK: %[[VAL_7:.*]], %[[VAL_8:.*]]:3 = quake.mz %[[VAL_6]]#1, %[[VAL_5]]#0, %[[VAL_6]]#0 name "result" : (!quake.wire, !quake.wire, !quake.wire) -> (!quake.measurements<3>, !quake.wire, !quake.wire, !quake.wire) +// CHECK: %[[VAL_7:.*]], %[[VAL_8:.*]]:3 = quake.mz %[[VAL_6]]#1, %[[VAL_5]]#0, %[[VAL_6]]#0 name "result" : (!quake.wire, !quake.wire, !quake.wire) -> (!cc.stdvec, !quake.wire, !quake.wire, !quake.wire) // CHECK-DAG: quake.return_wire %[[VAL_8]]#0 : !quake.wire // CHECK-DAG: quake.return_wire %[[VAL_8]]#1 : !quake.wire // CHECK-DAG: quake.return_wire %[[VAL_8]]#2 : !quake.wire diff --git a/test/Transforms/measurements_size.qke b/test/Transforms/measurements_size.qke deleted file mode 100644 index 5edf280223c..00000000000 --- a/test/Transforms/measurements_size.qke +++ /dev/null @@ -1,57 +0,0 @@ -// ========================================================================== // -// Copyright (c) 2026 NVIDIA Corporation & Affiliates. // -// All rights reserved. // -// // -// This source code and the accompanying materials are made available under // -// the terms of the Apache License 2.0 which accompanies this distribution. 
// -// ========================================================================== // - -// RUN: cudaq-opt --canonicalize %s | FileCheck %s -// RUN: cudaq-opt --convert-to-qir-api %s | FileCheck --check-prefix=QIR %s -// RUN: cudaq-opt --quake-to-qir %s | FileCheck --check-prefix=LLVM %s - -func.func @test_sized(%ms : !quake.measurements<4>) -> i64 { - %n = quake.measurements_size %ms : (!quake.measurements<4>) -> i64 - return %n : i64 -} - -func.func @test_unsized(%ms : !quake.measurements) -> i64 { - %n = quake.measurements_size %ms : (!quake.measurements) -> i64 - return %n : i64 -} - -// CHECK-LABEL: func.func @test_sized( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.measurements<4>) -> i64 { -// CHECK: %[[VAL_1:.*]] = arith.constant 4 : i64 -// CHECK: return %[[VAL_1]] : i64 -// CHECK: } - -// CHECK-LABEL: func.func @test_unsized( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.measurements) -> i64 { -// CHECK: %[[VAL_1:.*]] = quake.measurements_size %[[VAL_0]] : (!quake.measurements) -> i64 -// CHECK: return %[[VAL_1]] : i64 -// CHECK: } - -// QIR-LABEL: func.func @test_sized( -// QIR-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> i64 attributes {"qir-api"} { -// QIR: %[[VAL_1:.*]] = call @__quantum__rt__array_get_size_1d(%[[VAL_0]]) : (!cc.ptr>) -> i64 -// QIR: return %[[VAL_1]] : i64 -// QIR: } - -// QIR-LABEL: func.func @test_unsized( -// QIR-SAME: %[[VAL_0:.*]]: !cc.ptr>) -> i64 attributes {"qir-api"} { -// QIR: %[[VAL_1:.*]] = call @__quantum__rt__array_get_size_1d(%[[VAL_0]]) : (!cc.ptr>) -> i64 -// QIR: return %[[VAL_1]] : i64 -// QIR: } - -// LLVM-LABEL: llvm.func @test_sized( -// LLVM-SAME: %[[VAL_0:.*]]: !llvm.ptr>) -> i64 { -// LLVM: %[[VAL_1:.*]] = llvm.call @__quantum__rt__array_get_size_1d(%[[VAL_0]]) : (!llvm.ptr>) -> i64 -// LLVM: llvm.return %[[VAL_1]] : i64 -// LLVM: } - -// LLVM-LABEL: llvm.func @test_unsized( -// LLVM-SAME: %[[VAL_0:.*]]: !llvm.ptr>) -> i64 { -// LLVM: %[[VAL_1:.*]] = llvm.call @__quantum__rt__array_get_size_1d(%[[VAL_0]]) : (!llvm.ptr>) -> i64 -// 
LLVM: llvm.return %[[VAL_1]] : i64 -// LLVM: } diff --git a/test/Transforms/memtoreg-2.qke b/test/Transforms/memtoreg-2.qke index a4594ab8c0a..aedd2f3c69b 100644 --- a/test/Transforms/memtoreg-2.qke +++ b/test/Transforms/memtoreg-2.qke @@ -182,7 +182,7 @@ func.func @classical_if06(%veq : !quake.veq<2>, %c1: i1) { %q1 = quake.extract_ref %veq[%c_1] : (!quake.veq<2>, i32) -> !quake.ref quake.y %q1 : (!quake.ref) -> () } - %reg = quake.mz %veq : (!quake.veq<2>) -> !quake.measurements<2> + %reg = quake.mz %veq : (!quake.veq<2>) -> !cc.stdvec return } @@ -201,7 +201,7 @@ func.func @classical_if06(%veq : !quake.veq<2>, %c1: i1) { // CHECK: quake.wrap %[[VAL_9]] to %[[VAL_7]] : !quake.wire, !quake.ref // CHECK: } else { // CHECK: } -// CHECK: %[[VAL_10:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_10:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -216,7 +216,7 @@ func.func @classical_if07(%veq : !quake.veq<2>, %c1: i1, %c2: i1) { quake.reset %veq : (!quake.veq<2>) -> () } } - %reg = quake.mz %veq : (!quake.veq<2>) -> !quake.measurements<2> + %reg = quake.mz %veq : (!quake.veq<2>) -> !cc.stdvec return } @@ -236,7 +236,7 @@ func.func @classical_if07(%veq : !quake.veq<2>, %c1: i1, %c2: i1) { // CHECK: } // CHECK: } else { // CHECK: } -// CHECK: quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -329,7 +329,7 @@ func.func @scope_local_extract_and_vec_measurement(%veq : !quake.veq<2>) { %q1 = quake.extract_ref %veq[%c_1] : (!quake.veq<2>,i32) -> !quake.ref quake.y %q1 : (!quake.ref) -> () } - %reg = quake.mz %veq : (!quake.veq<2>) -> !quake.measurements<2> + %reg = quake.mz %veq : (!quake.veq<2>) -> !cc.stdvec return } @@ -344,7 +344,7 @@ func.func @scope_local_extract_and_vec_measurement(%veq : !quake.veq<2>) { // CHECK: %[[VAL_6:.*]] = quake.unwrap %[[VAL_5]] : (!quake.ref) -> 
!quake.wire // CHECK: %[[VAL_7:.*]] = quake.y %[[VAL_6]] : (!quake.wire) -> !quake.wire // CHECK: } -// CHECK: quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -356,7 +356,7 @@ func.func @vec_op_in_nested_scope(%veq : !quake.veq<2>) { quake.reset %veq : (!quake.veq<2>)-> () } } - %reg = quake.mz %veq : (!quake.veq<2>) -> !quake.measurements<2> + %reg = quake.mz %veq : (!quake.veq<2>) -> !cc.stdvec return } @@ -370,7 +370,7 @@ func.func @vec_op_in_nested_scope(%veq : !quake.veq<2>) { // CHECK: quake.reset %[[VAL_0]] : (!quake.veq<2>) -> () // CHECK: } // CHECK: } -// CHECK: quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -384,7 +384,7 @@ func.func @vec_op_in_nested_scope_and_local_extraction(%veq : !quake.veq<2>) { quake.reset %veq : (!quake.veq<2>) -> () } } - %reg = quake.mz %veq : (!quake.veq<2>) -> !quake.measurements<2> + %reg = quake.mz %veq : (!quake.veq<2>) -> !cc.stdvec return } @@ -401,7 +401,7 @@ func.func @vec_op_in_nested_scope_and_local_extraction(%veq : !quake.veq<2>) { // CHECK: quake.reset %[[VAL_0]] : (!quake.veq<2>) -> () // CHECK: } // CHECK: } -// CHECK: quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_0]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -524,7 +524,7 @@ func.func @raw_cfg05(%c1: i1) { ^bb2: cf.br ^bb3 ^bb3: - %reg = quake.mz %veq: (!quake.veq<2>) -> !quake.measurements<2> + %reg = quake.mz %veq: (!quake.veq<2>) -> !cc.stdvec quake.dealloc %veq : !quake.veq<2> return } @@ -541,7 +541,7 @@ func.func @raw_cfg05(%c1: i1) { // CHECK: ^bb2: // CHECK: cf.br ^bb3 // CHECK: ^bb3: -// CHECK: quake.mz %[[VAL_2]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_2]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -562,7 +562,7 @@ 
func.func @raw_cfg06(%c1: i1) { %q2 = quake.extract_ref %veq[%c1_i64] : (!quake.veq<2>, i64) -> !quake.ref cf.br ^bb5 ^bb5: - %reg = quake.mz %veq: (!quake.veq<2>)-> !quake.measurements<2> + %reg = quake.mz %veq: (!quake.veq<2>)-> !cc.stdvec quake.dealloc %veq : !quake.veq<2> return } @@ -588,7 +588,7 @@ func.func @raw_cfg06(%c1: i1) { // CHECK: %[[VAL_9:.*]] = quake.unwrap %[[VAL_8]] : (!quake.ref) -> !quake.wire // CHECK: cf.br ^bb5 // CHECK: ^bb5: -// CHECK: quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -605,7 +605,7 @@ func.func @raw_cfg07(%c1: i1) { quake.reset %veq: (!quake.veq<2>)->() cf.br ^bb3 ^bb3: - %reg = quake.mz %veq: (!quake.veq<2>) -> !quake.measurements<2> + %reg = quake.mz %veq: (!quake.veq<2>) -> !cc.stdvec quake.dealloc %veq : !quake.veq<2> return } @@ -626,7 +626,7 @@ func.func @raw_cfg07(%c1: i1) { // CHECK: quake.reset %[[VAL_3]] : (!quake.veq<2>) -> () // CHECK: cf.br ^bb3 // CHECK: ^bb3: -// CHECK: quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -730,7 +730,7 @@ func.func @mz_and_reset_veq_with_extracted_refs() { %0 = quake.alloca !quake.veq<2> %q0 = quake.extract_ref %0[%c_0] : (!quake.veq<2>, i32) -> !quake.ref %q1 = quake.extract_ref %0[%c_1] : (!quake.veq<2>, i32) -> !quake.ref - %reg = quake.mz %0 : (!quake.veq<2>) -> !quake.measurements<2> + %reg = quake.mz %0 : (!quake.veq<2>) -> !cc.stdvec quake.reset %0 : (!quake.veq<2>) -> () quake.dealloc %0 : !quake.veq<2> return @@ -744,7 +744,7 @@ func.func @mz_and_reset_veq_with_extracted_refs() { // CHECK: %[[VAL_4:.*]] = quake.unwrap %[[VAL_3]] : (!quake.ref) -> !quake.wire // CHECK: %[[VAL_5:.*]] = quake.extract_ref %[[VAL_2]][%[[VAL_1]]] : (!quake.veq<2>, i32) -> !quake.ref // CHECK: %[[VAL_6:.*]] = quake.unwrap %[[VAL_5]] : (!quake.ref) -> !quake.wire -// CHECK: 
quake.mz %[[VAL_2]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_2]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: quake.reset %[[VAL_2]] : (!quake.veq<2>) -> () // CHECK: return // CHECK: } @@ -817,7 +817,7 @@ func.func @floop_with_vector_and_qextract() { %4 = arith.addi %3, %c1_i64 : i64 cc.store %4, %alloca : !cc.ptr } - %2 = quake.mz %veq : (!quake.veq<2>) -> !quake.measurements<2> + %2 = quake.mz %veq : (!quake.veq<2>) -> !cc.stdvec quake.dealloc %veq : !quake.veq<2> return } @@ -843,7 +843,7 @@ func.func @floop_with_vector_and_qextract() { // CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_1]] : i64 // CHECK: cc.continue %[[VAL_14]] : i64 // CHECK: } -// CHECK: %[[VAL_15:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_15:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: quake.dealloc %[[VAL_3]] : !quake.veq<2> // CHECK: return // CHECK: } diff --git a/test/Transforms/memtoreg-3.qke b/test/Transforms/memtoreg-3.qke index c59584b7bb4..96e4461d7e0 100644 --- a/test/Transforms/memtoreg-3.qke +++ b/test/Transforms/memtoreg-3.qke @@ -83,7 +83,7 @@ func.func @promote_induction_variable() { %4 = arith.addi %3, %c1_i64 : i64 cc.store %4, %alloca : !cc.ptr } - %2 = quake.mz %veq : (!quake.veq<2>) -> !quake.measurements<2> + %2 = quake.mz %veq : (!quake.veq<2>) -> !cc.stdvec quake.dealloc %veq : !quake.veq<2> return } @@ -109,7 +109,7 @@ func.func @promote_induction_variable() { // CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_1]] : i64 // CHECK: cc.continue %[[VAL_14]] : i64 // CHECK: } -// CHECK: quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: quake.dealloc %[[VAL_3]] : !quake.veq<2> // CHECK: return // CHECK: } @@ -133,7 +133,7 @@ func.func @promote_induction_variable() { // TOMEM: %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_1]] : i64 // TOMEM: cc.continue %[[VAL_12]] : i64 // 
TOMEM: } -// TOMEM: quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !quake.measurements<2> +// TOMEM: quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !cc.stdvec // TOMEM: quake.dealloc %[[VAL_3]] : !quake.veq<2> // TOMEM: return // TOMEM: } diff --git a/test/Transforms/memtoreg-7.qke b/test/Transforms/memtoreg-7.qke index 3b234eba0a8..111d7934abb 100644 --- a/test/Transforms/memtoreg-7.qke +++ b/test/Transforms/memtoreg-7.qke @@ -30,8 +30,8 @@ func.func @__nvqpp__mlirgen__test() attributes {"cudaq-entrypoint", qubitMeasure %7 = cc.load %1 : !cc.ptr %8 = arith.cmpi eq, %7, %false : i1 cc.if(%8) { - %measOut_0 = quake.mz %0 name "inner_mz" : (!quake.veq<2>) -> !quake.measurements<2> - %9 = quake.discriminate %measOut_0 : (!quake.measurements<2>) -> !cc.stdvec + %measOut_0 = quake.mz %0 name "inner_mz" : (!quake.veq<2>) -> !cc.stdvec + %9 = quake.discriminate %measOut_0 : (!cc.stdvec) -> !cc.stdvec cc.scope { %10 = cc.alloca !cc.stdvec cc.store %9, %10 : !cc.ptr> @@ -46,8 +46,8 @@ func.func @__nvqpp__mlirgen__test() attributes {"cudaq-entrypoint", qubitMeasure %3 = cc.load %1 : !cc.ptr %4 = arith.cmpi eq, %3, %true : i1 cc.if(%4) { - %measOut = quake.mz %0 name "outer_mz" : (!quake.veq<2>) -> !quake.measurements<2> - %5 = quake.discriminate %measOut : (!quake.measurements<2>) -> !cc.stdvec + %measOut = quake.mz %0 name "outer_mz" : (!quake.veq<2>) -> !cc.stdvec + %5 = quake.discriminate %measOut : (!cc.stdvec) -> !cc.stdvec cc.scope { %6 = cc.alloca !cc.stdvec cc.store %5, %6 : !cc.ptr> @@ -76,8 +76,8 @@ func.func @__nvqpp__mlirgen__test() attributes {"cudaq-entrypoint", qubitMeasure // CHECK: %[[VAL_17:.*]] = quake.discriminate %[[VAL_15]] : (!quake.measure) -> i1 // CHECK: %[[VAL_18:.*]] = arith.cmpi eq, %[[VAL_17]], %[[VAL_0]] : i1 // CHECK: cc.if(%[[VAL_18]]) { -// CHECK: %[[VAL_19:.*]] = quake.mz %[[VAL_5]] name "inner_mz" : (!quake.veq<2>) -> !quake.measurements<2> -// CHECK: %[[VAL_20:.*]] = quake.discriminate %[[VAL_19]] : (!quake.measurements<2>) -> !cc.stdvec +// CHECK: 
%[[VAL_19:.*]] = quake.mz %[[VAL_5]] name "inner_mz" : (!quake.veq<2>) -> !cc.stdvec +// CHECK: %[[VAL_20:.*]] = quake.discriminate %[[VAL_19]] : (!cc.stdvec) -> !cc.stdvec // CHECK: cc.scope { // CHECK: %[[VAL_21:.*]] = cc.undef !cc.stdvec // CHECK: } @@ -91,8 +91,8 @@ func.func @__nvqpp__mlirgen__test() attributes {"cudaq-entrypoint", qubitMeasure // CHECK: } {invariant} // CHECK: %[[VAL_25:.*]] = arith.cmpi eq, %[[VAL_26:.*]]#1, %[[VAL_3]] : i1 // CHECK: cc.if(%[[VAL_25]]) { -// CHECK: %[[VAL_27:.*]] = quake.mz %[[VAL_5]] name "outer_mz" : (!quake.veq<2>) -> !quake.measurements<2> -// CHECK: %[[VAL_28:.*]] = quake.discriminate %[[VAL_27]] : (!quake.measurements<2>) -> !cc.stdvec +// CHECK: %[[VAL_27:.*]] = quake.mz %[[VAL_5]] name "outer_mz" : (!quake.veq<2>) -> !cc.stdvec +// CHECK: %[[VAL_28:.*]] = quake.discriminate %[[VAL_27]] : (!cc.stdvec) -> !cc.stdvec // CHECK: cc.scope { // CHECK: %[[VAL_29:.*]] = cc.undef !cc.stdvec // CHECK: } @@ -112,18 +112,18 @@ func.func @__nvqpp__mlirgen__test() attributes {"cudaq-entrypoint", qubitMeasure // CANOE: %[[VAL_7:.*]] = quake.discriminate %[[VAL_6]] : (!quake.measure) -> i1 // CANOE: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_1]] : i1 // CANOE: cc.if(%[[VAL_8]]) { -// CANOE: %[[VAL_9:.*]] = quake.mz %[[VAL_3]] name "inner_mz" : (!quake.ref) -> !quake.measure -// CANOE: %[[VAL_10:.*]] = quake.mz %[[VAL_4]] name "inner_mz" : (!quake.ref) -> !quake.measure -// CANOE: %[[VAL_11:.*]] = cc.alloca !cc.array -// CANOE: %[[VAL_12:.*]] = quake.discriminate %[[VAL_9]] : (!quake.measure) -> i1 -// CANOE: %[[VAL_13:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr -// CANOE: %[[VAL_14:.*]] = cc.cast unsigned %[[VAL_12]] : (i1) -> i8 -// CANOE: cc.store %[[VAL_14]], %[[VAL_13]] : !cc.ptr -// CANOE: %[[VAL_15:.*]] = quake.discriminate %[[VAL_10]] : (!quake.measure) -> i1 -// CANOE: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_11]][1] : (!cc.ptr>) -> !cc.ptr +// CANOE: %[[VAL_9:.*]] = cc.alloca !cc.array +// CANOE: %[[VAL_10:.*]] = 
quake.mz %[[VAL_3]] name "inner_mz" : (!quake.ref) -> !quake.measure +// CANOE: %[[VAL_11:.*]] = quake.discriminate %[[VAL_10]] : (!quake.measure) -> i1 +// CANOE: %[[VAL_12:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr +// CANOE: %[[VAL_13:.*]] = cc.cast unsigned %[[VAL_11]] : (i1) -> i8 +// CANOE: cc.store %[[VAL_13]], %[[VAL_12]] : !cc.ptr +// CANOE: %[[VAL_14:.*]] = quake.mz %[[VAL_4]] name "inner_mz" : (!quake.ref) -> !quake.measure +// CANOE: %[[VAL_15:.*]] = quake.discriminate %[[VAL_14]] : (!quake.measure) -> i1 +// CANOE: %[[VAL_16:.*]] = cc.compute_ptr %[[VAL_9]][1] : (!cc.ptr>) -> !cc.ptr // CANOE: %[[VAL_17:.*]] = cc.cast unsigned %[[VAL_15]] : (i1) -> i8 // CANOE: cc.store %[[VAL_17]], %[[VAL_16]] : !cc.ptr -// CANOE: %[[VAL_18:.*]] = cc.cast %[[VAL_11]] : (!cc.ptr>) -> !cc.ptr> +// CANOE: %[[VAL_18:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr> // CANOE: %[[VAL_19:.*]] = cc.stdvec_init %[[VAL_18]], %[[VAL_0]] : (!cc.ptr>, i64) -> !cc.stdvec // CANOE: cc.scope { // CANOE: %[[VAL_20:.*]] = cc.undef !cc.stdvec @@ -134,18 +134,18 @@ func.func @__nvqpp__mlirgen__test() attributes {"cudaq-entrypoint", qubitMeasure // CANOE: %[[VAL_22:.*]] = quake.discriminate %[[VAL_21]] : (!quake.measure) -> i1 // CANOE: %[[VAL_23:.*]] = arith.cmpi eq, %[[VAL_22]], %[[VAL_1]] : i1 // CANOE: cc.if(%[[VAL_23]]) { -// CANOE: %[[VAL_24:.*]] = quake.mz %[[VAL_3]] name "inner_mz" : (!quake.ref) -> !quake.measure -// CANOE: %[[VAL_25:.*]] = quake.mz %[[VAL_4]] name "inner_mz" : (!quake.ref) -> !quake.measure -// CANOE: %[[VAL_26:.*]] = cc.alloca !cc.array -// CANOE: %[[VAL_27:.*]] = quake.discriminate %[[VAL_24]] : (!quake.measure) -> i1 -// CANOE: %[[VAL_28:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr>) -> !cc.ptr -// CANOE: %[[VAL_29:.*]] = cc.cast unsigned %[[VAL_27]] : (i1) -> i8 -// CANOE: cc.store %[[VAL_29]], %[[VAL_28]] : !cc.ptr -// CANOE: %[[VAL_30:.*]] = quake.discriminate %[[VAL_25]] : (!quake.measure) -> i1 -// CANOE: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_26]][1] : 
(!cc.ptr>) -> !cc.ptr +// CANOE: %[[VAL_24:.*]] = cc.alloca !cc.array +// CANOE: %[[VAL_25:.*]] = quake.mz %[[VAL_3]] name "inner_mz" : (!quake.ref) -> !quake.measure +// CANOE: %[[VAL_26:.*]] = quake.discriminate %[[VAL_25]] : (!quake.measure) -> i1 +// CANOE: %[[VAL_27:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr>) -> !cc.ptr +// CANOE: %[[VAL_28:.*]] = cc.cast unsigned %[[VAL_26]] : (i1) -> i8 +// CANOE: cc.store %[[VAL_28]], %[[VAL_27]] : !cc.ptr +// CANOE: %[[VAL_29:.*]] = quake.mz %[[VAL_4]] name "inner_mz" : (!quake.ref) -> !quake.measure +// CANOE: %[[VAL_30:.*]] = quake.discriminate %[[VAL_29]] : (!quake.measure) -> i1 +// CANOE: %[[VAL_31:.*]] = cc.compute_ptr %[[VAL_24]][1] : (!cc.ptr>) -> !cc.ptr // CANOE: %[[VAL_32:.*]] = cc.cast unsigned %[[VAL_30]] : (i1) -> i8 // CANOE: cc.store %[[VAL_32]], %[[VAL_31]] : !cc.ptr -// CANOE: %[[VAL_33:.*]] = cc.cast %[[VAL_26]] : (!cc.ptr>) -> !cc.ptr> +// CANOE: %[[VAL_33:.*]] = cc.cast %[[VAL_24]] : (!cc.ptr>) -> !cc.ptr> // CANOE: %[[VAL_34:.*]] = cc.stdvec_init %[[VAL_33]], %[[VAL_0]] : (!cc.ptr>, i64) -> !cc.stdvec // CANOE: cc.scope { // CANOE: %[[VAL_35:.*]] = cc.undef !cc.stdvec @@ -154,18 +154,18 @@ func.func @__nvqpp__mlirgen__test() attributes {"cudaq-entrypoint", qubitMeasure // CANOE: } // CANOE: %[[VAL_36:.*]] = arith.cmpi eq, %[[VAL_22]], %[[VAL_2]] : i1 // CANOE: cc.if(%[[VAL_36]]) { -// CANOE: %[[VAL_37:.*]] = quake.mz %[[VAL_3]] name "outer_mz" : (!quake.ref) -> !quake.measure -// CANOE: %[[VAL_38:.*]] = quake.mz %[[VAL_4]] name "outer_mz" : (!quake.ref) -> !quake.measure -// CANOE: %[[VAL_39:.*]] = cc.alloca !cc.array -// CANOE: %[[VAL_40:.*]] = quake.discriminate %[[VAL_37]] : (!quake.measure) -> i1 -// CANOE: %[[VAL_41:.*]] = cc.cast %[[VAL_39]] : (!cc.ptr>) -> !cc.ptr -// CANOE: %[[VAL_42:.*]] = cc.cast unsigned %[[VAL_40]] : (i1) -> i8 -// CANOE: cc.store %[[VAL_42]], %[[VAL_41]] : !cc.ptr -// CANOE: %[[VAL_43:.*]] = quake.discriminate %[[VAL_38]] : (!quake.measure) -> i1 -// CANOE: %[[VAL_44:.*]] = 
cc.compute_ptr %[[VAL_39]][1] : (!cc.ptr>) -> !cc.ptr +// CANOE: %[[VAL_37:.*]] = cc.alloca !cc.array +// CANOE: %[[VAL_38:.*]] = quake.mz %[[VAL_3]] name "outer_mz" : (!quake.ref) -> !quake.measure +// CANOE: %[[VAL_39:.*]] = quake.discriminate %[[VAL_38]] : (!quake.measure) -> i1 +// CANOE: %[[VAL_40:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr>) -> !cc.ptr +// CANOE: %[[VAL_41:.*]] = cc.cast unsigned %[[VAL_39]] : (i1) -> i8 +// CANOE: cc.store %[[VAL_41]], %[[VAL_40]] : !cc.ptr +// CANOE: %[[VAL_42:.*]] = quake.mz %[[VAL_4]] name "outer_mz" : (!quake.ref) -> !quake.measure +// CANOE: %[[VAL_43:.*]] = quake.discriminate %[[VAL_42]] : (!quake.measure) -> i1 +// CANOE: %[[VAL_44:.*]] = cc.compute_ptr %[[VAL_37]][1] : (!cc.ptr>) -> !cc.ptr // CANOE: %[[VAL_45:.*]] = cc.cast unsigned %[[VAL_43]] : (i1) -> i8 // CANOE: cc.store %[[VAL_45]], %[[VAL_44]] : !cc.ptr -// CANOE: %[[VAL_46:.*]] = cc.cast %[[VAL_39]] : (!cc.ptr>) -> !cc.ptr> +// CANOE: %[[VAL_46:.*]] = cc.cast %[[VAL_37]] : (!cc.ptr>) -> !cc.ptr> // CANOE: %[[VAL_47:.*]] = cc.stdvec_init %[[VAL_46]], %[[VAL_0]] : (!cc.ptr>, i64) -> !cc.stdvec // CANOE: cc.scope { // CANOE: %[[VAL_48:.*]] = cc.undef !cc.stdvec diff --git a/test/Transforms/mz.qke b/test/Transforms/mz.qke index 4701477c396..0ea14e788b4 100644 --- a/test/Transforms/mz.qke +++ b/test/Transforms/mz.qke @@ -13,7 +13,7 @@ func.func @static.mz_test() { %1 = quake.alloca !quake.veq<4> %2 = quake.alloca !quake.veq<2> %3 = quake.alloca !quake.ref - quake.mz %0, %1, %2, %3 : (!quake.ref, !quake.veq<4>, !quake.veq<2>, !quake.ref) -> !quake.measurements + quake.mz %0, %1, %2, %3 : (!quake.ref, !quake.veq<4>, !quake.veq<2>, !quake.ref) -> !cc.stdvec return } @@ -22,7 +22,7 @@ func.func @static.mz_test() { // CHECK: %[[VAL_1:.*]] = quake.alloca !quake.veq<4> // CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<2> // CHECK: %[[VAL_3:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]] : (!quake.ref, 
!quake.veq<4>, !quake.veq<2>, !quake.ref) -> !quake.measurements +// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]] : (!quake.ref, !quake.veq<4>, !quake.veq<2>, !quake.ref) -> !cc.stdvec // CHECK: return // CHECK: } @@ -31,7 +31,7 @@ func.func @dynamic.mz_test(%arg0 : i32, %arg1 : i32) { %1 = quake.alloca !quake.veq[%arg0 : i32] %2 = quake.alloca !quake.veq[%arg1 : i32] %3 = quake.alloca !quake.ref - quake.mz %0, %1, %2, %3 : (!quake.ref, !quake.veq, !quake.veq, !quake.ref) -> !quake.measurements + quake.mz %0, %1, %2, %3 : (!quake.ref, !quake.veq, !quake.veq, !quake.ref) -> !cc.stdvec return } @@ -41,7 +41,7 @@ func.func @dynamic.mz_test(%arg0 : i32, %arg1 : i32) { // CHECK: %[[VAL_3:.*]] = quake.alloca !quake.veq[%[[VAL_0]] : i32] // CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq[%[[VAL_1]] : i32] // CHECK: %[[VAL_5:.*]] = quake.alloca !quake.ref -// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_2]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]] : (!quake.ref, !quake.veq, !quake.veq, !quake.ref) -> !quake.measurements +// CHECK: %[[VAL_6:.*]] = quake.mz %[[VAL_2]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]] : (!quake.ref, !quake.veq, !quake.veq, !quake.ref) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/Transforms/propagate_metadata_apply.qke b/test/Transforms/propagate_metadata_apply.qke index 41f7d7e2998..99b4a2b8272 100644 --- a/test/Transforms/propagate_metadata_apply.qke +++ b/test/Transforms/propagate_metadata_apply.qke @@ -15,7 +15,7 @@ module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__controlled_operat quake.h %1 : (!quake.ref) -> () %2 = quake.extract_ref %0[1] : (!quake.veq<2>) -> !quake.ref quake.apply %arg0 [%1] %2 : (!quake.ref, !quake.ref) -> () - %measOut = quake.mz %0 : (!quake.veq<2>) -> !quake.measurements<2> + %measOut = quake.mz %0 : (!quake.veq<2>) -> !cc.stdvec return } } @@ -29,6 +29,6 @@ module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__controlled_operat // CHECK: quake.h %[[VAL_2]] : 
(!quake.ref) -> () // CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_1]][1] : (!quake.veq<2>) -> !quake.ref // CHECK: quake.apply %[[VAL_0]] {{\[}}%[[VAL_2]]] %[[VAL_3]] : (!quake.ref, !quake.ref) -> () -// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_1]] : (!quake.veq<2>) -> !quake.measurements<2> +// CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_1]] : (!quake.veq<2>) -> !cc.stdvec // CHECK: return // CHECK: } diff --git a/test/Transforms/quake-errors.qke b/test/Transforms/quake-errors.qke index 68b20d83cf7..08442c9d47a 100644 --- a/test/Transforms/quake-errors.qke +++ b/test/Transforms/quake-errors.qke @@ -493,11 +493,3 @@ func.func @test(%0: !quake.veq<3>, %1: !quake.veq<2>, %2: !quake.ref) { %4 = quake.concat %0, %1, %2 : (!quake.veq<3>, !quake.veq<2>, !quake.ref) -> !quake.veq<34> return } - -// ----- - -func.func @test(%ms : !quake.measurements<4>) -> !quake.measure { - // expected-error @+1 {{'quake.get_measure' op invalid index [4] because >= size [4]}} - %m = quake.get_measure %ms[4] : (!quake.measurements<4>) -> !quake.measure - return %m : !quake.measure -} diff --git a/test/Transforms/resource_count_preprocess.qke b/test/Transforms/resource_count_preprocess.qke index 5aaf239857a..23aed20fb6e 100644 --- a/test/Transforms/resource_count_preprocess.qke +++ b/test/Transforms/resource_count_preprocess.qke @@ -58,7 +58,7 @@ func.func @kernel2() { // CHECK: Preprocessing h(0) for 9 counts // CHECK-LABEL: func.func @kernel3() { // CHECK: %0 = quake.alloca !quake.veq<10> -// CHECK: %measOut = quake.mz %0 : (!quake.veq<10>) -> !quake.measurements<10> +// CHECK: %measOut = quake.mz %0 : (!quake.veq<10>) -> !cc.stdvec // CHECK: return // CHECK: } @@ -82,7 +82,7 @@ func.func @kernel3() { %2 = arith.addi %arg0, %c1_i64 : i64 cc.continue %2 : i64 } - %measOut = quake.mz %0 : (!quake.veq<10>) -> !quake.measurements<10> + %measOut = quake.mz %0 : (!quake.veq<10>) -> !cc.stdvec return } diff --git a/test/Transforms/return_vector.qke b/test/Transforms/return_vector.qke index 
de4f72ff646..ddeccffa1c8 100644 --- a/test/Transforms/return_vector.qke +++ b/test/Transforms/return_vector.qke @@ -15,7 +15,7 @@ module attributes{ quake.mangled_name_map = { func.func private @malloc(i64) -> !cc.ptr -func.func @__nvqpp__mlirgen__test_0(%arg0: i32) -> !cc.stdvec attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_0(%arg0: i32) -> !cc.stdvec { %0 = arith.constant 256 : i64 %1 = call @malloc(%0) : (i64) -> !cc.ptr %2 = arith.constant 8 : i64 @@ -28,7 +28,7 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { // CHECK: %[[VAL_1:.*]] = arith.constant 8 : i64 // CHECK: %[[VAL_2:.*]] = arith.constant 256 : i64 // CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr @@ -113,7 +113,7 @@ func.func @test_0(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_1(%arg0: i32) -> !cc.stdvec { %0 = arith.constant 520 : i64 %1 = call @malloc(%0) : (i64) -> !cc.ptr %2 = arith.constant 9 : i64 @@ -126,7 +126,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr !cc.stdvec +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> !cc.stdvec { // CHECK: %[[VAL_1:.*]] = arith.constant 9 : i64 // CHECK: %[[VAL_2:.*]] = arith.constant 520 : i64 // CHECK: %[[VAL_3:.*]] = call @malloc(%[[VAL_2]]) : (i64) -> !cc.ptr @@ -236,7 +236,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, // CHECK-SAME: %[[VAL_1:.*]]: i64, // CHECK-SAME: %[[VAL_2:.*]]: !cc.ptr, i64}>>, -// CHECK-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> +// CHECK-SAME: %[[VAL_3:.*]]: i64) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_4:.*]] = arith.constant false // CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr, i64}>>) -> !cc.ptr // CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_5]] : !cc.ptr @@ -265,7 +265,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK-SAME: %[[VAL_1:.*]]: i1) 
-> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr @@ -287,7 +287,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr>, -// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 +// CHECK-SAME: %[[VAL_1:.*]]: !cc.ptr>) -> i64 { // CHECK: %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr> // CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr) -> !cc.ptr // CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_3]] : !cc.ptr @@ -320,7 +320,7 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr, -// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> +// CHECK-SAME: %[[VAL_1:.*]]: i1) -> !cc.struct<{!cc.ptr, i64}> { // CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr, i64}>}>> // CHECK: %[[VAL_3:.*]] = cc.sizeof !cc.struct<{i32, !cc.struct<{!cc.ptr, i64}>}> : i64 // CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_0]] : (!cc.ptr) -> !cc.ptr @@ -366,3 +366,4 @@ func.func @test_1(%0: !cc.ptr, !cc.ptr, !cc.ptr () %15 = quake.mx %4 : (!quake.ref) -> !quake.measure - %16 = quake.my %5 : (!quake.veq) -> !quake.measurements - %17 = quake.mz %6 : (!quake.veq<5>) -> !quake.measurements<5> - %ms_relaxed = quake.relax_size %17 : (!quake.measurements<5>) -> !quake.measurements + %16 = quake.my %5 : (!quake.veq) -> !cc.stdvec + %17 = quake.mz %6 : (!quake.veq<5>) -> !cc.stdvec %z15 = quake.discriminate %15 : (!quake.measure) -> i1 - %z16 = quake.discriminate %16 : (!quake.measurements) -> !cc.stdvec + %z16 = quake.discriminate %16 : (!cc.stdvec) -> !cc.stdvec // Quantum operations, wire form %19 = cc.undef i32 {wires = true} @@ -217,11 +216,10 @@ cc.global constant private @quantum_ops.rodata_synth_0 (dense<[(0.707106769,0.00 // CHECK: %[[VAL_22:.*]] = arith.constant 3.400000e+01 : f32 // CHECK: quake.u3 (%[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) %[[VAL_10]] : (f32, f32, f32, !quake.ref) 
-> () // CHECK: %[[VAL_23:.*]] = quake.mx %[[VAL_5]] : (!quake.ref) -> !quake.measure -// CHECK: %[[VAL_24:.*]] = quake.my %[[VAL_8]] : (!quake.veq) -> !quake.measurements -// CHECK: %[[VAL_25:.*]] = quake.mz %[[VAL_9]] : (!quake.veq<5>) -> !quake.measurements<5> -// CHECK: %[[VAL_125:.*]] = quake.relax_size %[[VAL_25]] : (!quake.measurements<5>) -> !quake.measurements +// CHECK: %[[VAL_24:.*]] = quake.my %[[VAL_8]] : (!quake.veq) -> !cc.stdvec +// CHECK: %[[VAL_25:.*]] = quake.mz %[[VAL_9]] : (!quake.veq<5>) -> !cc.stdvec // CHECK: %[[VAL_123:.*]] = quake.discriminate %[[VAL_23]] : (!quake.measure) -> i1 -// CHECK: %[[VAL_124:.*]] = quake.discriminate %[[VAL_24]] : (!quake.measurements) -> !cc.stdvec +// CHECK: %[[VAL_124:.*]] = quake.discriminate %[[VAL_24]] : (!cc.stdvec) -> !cc.stdvec // CHECK: %[[VAL_26:.*]] = cc.undef i32 {wires = true} // CHECK: %[[VAL_27:.*]] = quake.null_wire // CHECK: %[[VAL_28:.*]] = quake.null_wire @@ -977,28 +975,3 @@ func.func @integrated_device() { // CHECK: %[[VAL_14:.*]] = cc.device_call @integrated_device_callback<%[[VAL_2]], %[[VAL_10]], %[[VAL_11]] * %[[VAL_3]], %[[VAL_12]], %[[VAL_13]]> on %[[VAL_5]](%[[VAL_0]]) : (i64, i64, i64, i64, i64, i64, i64, i64) -> i64 // CHECK: return // CHECK: } - -func.func @measurements_ops(%ms4 : !quake.measurements<4>, - %msd : !quake.measurements, - %idx : index) { - %m0 = quake.get_measure %ms4[0] : (!quake.measurements<4>) -> !quake.measure - %m_dyn = quake.get_measure %ms4[%idx] : (!quake.measurements<4>, index) -> !quake.measure - %c2 = arith.constant 2 : i64 - %m_unsized = quake.get_measure %msd[%c2] : (!quake.measurements, i64) -> !quake.measure - %n4 = quake.measurements_size %ms4 : (!quake.measurements<4>) -> i64 - %nd = quake.measurements_size %msd : (!quake.measurements) -> i64 - return -} - -// CHECK-LABEL: func.func @measurements_ops( -// CHECK-SAME: %[[VAL_0:.*]]: !quake.measurements<4>, -// CHECK-SAME: %[[VAL_1:.*]]: !quake.measurements, -// CHECK-SAME: %[[VAL_2:.*]]: index) { -// 
CHECK: %[[VAL_3:.*]] = quake.get_measure %[[VAL_0]][0] : (!quake.measurements<4>) -> !quake.measure -// CHECK: %[[VAL_4:.*]] = quake.get_measure %[[VAL_0]]{{\[}}%[[VAL_2]]] : (!quake.measurements<4>, index) -> !quake.measure -// CHECK: %[[VAL_5:.*]] = arith.constant 2 : i64 -// CHECK: %[[VAL_6:.*]] = quake.get_measure %[[VAL_1]]{{\[}}%[[VAL_5]]] : (!quake.measurements, i64) -> !quake.measure -// CHECK: %[[VAL_7:.*]] = quake.measurements_size %[[VAL_0]] : (!quake.measurements<4>) -> i64 -// CHECK: %[[VAL_8:.*]] = quake.measurements_size %[[VAL_1]] : (!quake.measurements) -> i64 -// CHECK: return -// CHECK: } diff --git a/test/Translate/OpenQASM/basic.qke b/test/Translate/OpenQASM/basic.qke index 6db71117e66..3eab2c14b7c 100644 --- a/test/Translate/OpenQASM/basic.qke +++ b/test/Translate/OpenQASM/basic.qke @@ -71,7 +71,7 @@ module { quake.apply @umaj %cout, %b0, %a0 : (!quake.ref, !quake.ref, !quake.ref) -> () - %ans = quake.mz %b : (!quake.veq<4>) -> !quake.measurements<4> + %ans = quake.mz %b : (!quake.veq<4>) -> !cc.stdvec %ans_cout = quake.mz %cout : (!quake.ref) -> !quake.measure return } diff --git a/test/Translate/argument.qke b/test/Translate/argument.qke index f9a0a4859e5..220e945eacf 100644 --- a/test/Translate/argument.qke +++ b/test/Translate/argument.qke @@ -18,7 +18,7 @@ module func.func private @anchor(!cc.ptr, i64) -func.func @__nvqpp__mlirgen__test_0(%arg0: !cc.stdvec>) attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_0(%arg0: !cc.stdvec>) { %0 = cc.stdvec_data %arg0 : (!cc.stdvec>) -> !cc.ptr> %1 = cc.stdvec_size %arg0 : (!cc.stdvec>) -> i64 %2 = cc.cast %0 : (!cc.ptr>) -> !cc.ptr @@ -74,7 +74,7 @@ func.func @test_0(%0: !cc.ptr, %1: !cc.ptr, !cc.stdvec}>) attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_1(%arg0 : !cc.struct<{!cc.stdvec, !cc.stdvec}>) { %0 = cc.extract_value %arg0[0] : (!cc.struct<{!cc.stdvec, !cc.stdvec}>) -> !cc.stdvec %1 = cc.stdvec_data %0 : (!cc.stdvec) -> !cc.ptr> %2 = cc.stdvec_size %0 : 
(!cc.stdvec) -> i64 @@ -156,7 +156,7 @@ func.func @test_1(%0: !cc.ptr, %1: !cc.ptr>) attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_2(%arg0: !cc.stdvec>) { %0 = cc.stdvec_data %arg0 : (!cc.stdvec>) -> !cc.ptr> %1 = cc.stdvec_size %arg0 : (!cc.stdvec>) -> i64 %2 = cc.cast %0 : (!cc.ptr>) -> !cc.ptr @@ -212,7 +212,7 @@ func.func @test_2(%0: !cc.ptr, %1: !cc.ptr, !cc.stdvec}>) attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_3(%arg0 : !cc.struct<{!cc.stdvec, !cc.stdvec}>) { %0 = cc.extract_value %arg0[0] : (!cc.struct<{!cc.stdvec, !cc.stdvec}>) -> !cc.stdvec %1 = cc.stdvec_data %0 : (!cc.stdvec) -> !cc.ptr %2 = cc.stdvec_size %0 : (!cc.stdvec) -> i64 @@ -489,3 +489,4 @@ func.func @test_3(%0: !cc.ptr, %1: !cc.ptr , %arg1: i64 , %arg2: i64 ) -> !cc.ptr // vector -> struct ptr sret -func.func @__nvqpp__mlirgen__test_0(%arg0: i32) -> !cc.stdvec attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_0(%arg0: i32) -> !cc.stdvec { %c1_i64 = arith.constant 1 : i64 %c1 = arith.constant 1 : i64 %c0 = arith.constant 0 : i64 @@ -46,8 +46,8 @@ func.func @__nvqpp__mlirgen__test_0(%arg0: i32) -> !cc.stdvec attributes {"c %12 = arith.addi %arg1, %c1 : i64 cc.continue %12 : i64 } {invariant} - %measOut = quake.mz %3 : (!quake.veq) -> !quake.measurements - %7 = quake.discriminate %measOut : (!quake.measurements) -> !cc.stdvec + %measOut = quake.mz %3 : (!quake.veq) -> !cc.stdvec + %7 = quake.discriminate %measOut : (!cc.stdvec) -> !cc.stdvec %8 = cc.stdvec_data %7 : (!cc.stdvec) -> !cc.ptr %9 = cc.stdvec_size %7 : (!cc.stdvec) -> i64 %10 = call @__nvqpp_vectorCopyCtor(%8, %9, %c1_i64) : (!cc.ptr, i64, i64) -> !cc.ptr @@ -61,47 +61,46 @@ func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__test_0(i32 // CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr { -// CHECK: %[[VAL_0:.*]] = sext i32 -// CHECK: %[[VAL_1:.*]] to i64 -// CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]* 
@__quantum__rt__qubit_allocate_array(i64 %[[VAL_0]]) +// CHECK: %[[VAL_1:.*]] = sext i32 %[[VAL_0]] to i64 +// CHECK: %[[VAL_2:.*]] = tail call %[[VAL_3:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_1]]) // CHECK: %[[VAL_4:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_3]]* %[[VAL_2]]) // CHECK: %[[VAL_5:.*]] = icmp sgt i64 %[[VAL_4]], 0 // CHECK: br i1 %[[VAL_5]], label %[[VAL_6:.*]], label %[[VAL_7:.*]] -// CHECK: ._crit_edge.thread: -// CHECK: %[[VAL_15:.*]] = alloca i8, i64 %[[VAL_4]], align 1 -// CHECK: br label %[[VAL_33:.*]] -// CHECK: .lr.ph: ; preds = %[[VAL_8:.*]], %[[VAL_6]] -// CHECK: %[[VAL_9:.*]] = phi i64 [ %[[VAL_10:.*]], %[[VAL_6]] ], [ 0, %[[VAL_8]] ] -// CHECK: %[[VAL_11:.*]] = tail call %[[VAL_12:.*]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_9]]) -// CHECK: %[[VAL_13:.*]] = load %[[VAL_12]]*, %[[VAL_12]]** %[[VAL_11]], align 8 -// CHECK: tail call void @__quantum__qis__h(%[[VAL_12]]* %[[VAL_13]]) -// CHECK: %[[VAL_10]] = add nuw nsw i64 %[[VAL_9]], 1 -// CHECK: %[[VAL_14:.*]] = icmp eq i64 %[[VAL_10]], %[[VAL_4]] -// CHECK: br i1 %[[VAL_14]], label %[[VAL_16:.*]], label %[[VAL_6]] -// CHECK: ._crit_edge: -// CHECK: %[[VAL_17:.*]] = alloca i8, i64 %[[VAL_4]], align 1 -// CHECK: br i1 %[[VAL_5]], label %[[VAL_18:.*]], label %[[VAL_33]] -// CHECK: .lr.ph4: ; preds = %[[VAL_16]], %[[VAL_18]] -// CHECK: %[[VAL_19:.*]] = phi i64 [ %[[VAL_20:.*]], %[[VAL_18]] ], [ 0, %[[VAL_16]] ] -// CHECK: %[[VAL_21:.*]] = tail call %[[VAL_12]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_19]]) -// CHECK: %[[VAL_22:.*]] = load %[[VAL_12]]*, %[[VAL_12]]** %[[VAL_21]], align 8 -// CHECK: %[[VAL_23:.*]] = tail call %[[VAL_24:.*]]* @__quantum__qis__mz(%[[VAL_12]]* %[[VAL_22]]) -// CHECK: %[[VAL_25:.*]] = bitcast %[[VAL_24]]* %[[VAL_23]] to i1* -// CHECK: %[[VAL_26:.*]] = load i1, i1* %[[VAL_25]], align 1 -// CHECK: %[[VAL_27:.*]] = getelementptr i8, i8* %[[VAL_17]], i64 
%[[VAL_19]] -// CHECK: %[[VAL_28:.*]] = zext i1 %[[VAL_26]] to i8 -// CHECK: store i8 %[[VAL_28]], i8* %[[VAL_27]], align 1 -// CHECK: %[[VAL_20]] = add nuw nsw i64 %[[VAL_19]], 1 -// CHECK: %[[VAL_29:.*]] = icmp eq i64 %[[VAL_20]], %[[VAL_4]] -// CHECK: br i1 %[[VAL_29]], label %[[VAL_33]], label %[[VAL_18]] -// CHECK: ._crit_edge5: -// CHECK: %[[VAL_30:.*]] = phi i8* -// CHECK: %[[VAL_43:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_30]], i64 %[[VAL_4]], i64 1) -// CHECK: %[[VAL_44:.*]] = bitcast i8* %[[VAL_43]] to i1* -// CHECK: %[[VAL_45:.*]] = insertvalue { i1*, i64 } undef, i1* %[[VAL_44]], 0 -// CHECK: %[[VAL_46:.*]] = insertvalue { i1*, i64 } %[[VAL_45]], i64 %[[VAL_4]], 1 +// CHECK: ._crit_edge.thread: ; preds = %[[VAL_8:.*]] +// CHECK: %[[VAL_9:.*]] = alloca i8, i64 %[[VAL_4]], align 1 +// CHECK: br label %[[VAL_10:.*]] +// CHECK: .lr.ph: ; preds = %[[VAL_8]], %[[VAL_6]] +// CHECK: %[[VAL_11:.*]] = phi i64 [ %[[VAL_12:.*]], %[[VAL_6]] ], [ 0, %[[VAL_8]] ] +// CHECK: %[[VAL_13:.*]] = tail call %[[VAL_14:.*]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_11]]) +// CHECK: %[[VAL_15:.*]] = load %[[VAL_14]]*, %[[VAL_14]]** %[[VAL_13]], align 8 +// CHECK: tail call void @__quantum__qis__h(%[[VAL_14]]* %[[VAL_15]]) +// CHECK: %[[VAL_12]] = add nuw nsw i64 %[[VAL_11]], 1 +// CHECK: %[[VAL_16:.*]] = icmp eq i64 %[[VAL_12]], %[[VAL_4]] +// CHECK: br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_6]] +// CHECK: ._crit_edge: ; preds = %[[VAL_6]] +// CHECK: %[[VAL_18:.*]] = alloca i8, i64 %[[VAL_4]], align 1 +// CHECK: br i1 %[[VAL_5]], label %[[VAL_19:.*]], label %[[VAL_10]] +// CHECK: .lr.ph4: ; preds = %[[VAL_17]], %[[VAL_19]] +// CHECK: %[[VAL_20:.*]] = phi i64 [ %[[VAL_21:.*]], %[[VAL_19]] ], [ 0, %[[VAL_17]] ] +// CHECK: %[[VAL_22:.*]] = tail call %[[VAL_14]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_3]]* %[[VAL_2]], i64 %[[VAL_20]]) +// CHECK: %[[VAL_23:.*]] = load %[[VAL_14]]*, %[[VAL_14]]** %[[VAL_22]], 
align 8 +// CHECK: %[[VAL_24:.*]] = tail call %[[VAL_25:.*]]* @__quantum__qis__mz(%[[VAL_14]]* %[[VAL_23]]) +// CHECK: %[[VAL_26:.*]] = bitcast %[[VAL_25]]* %[[VAL_24]] to i1* +// CHECK: %[[VAL_27:.*]] = load i1, i1* %[[VAL_26]], align 1 +// CHECK: %[[VAL_28:.*]] = getelementptr i8, i8* %[[VAL_18]], i64 %[[VAL_20]] +// CHECK: %[[VAL_29:.*]] = zext i1 %[[VAL_27]] to i8 +// CHECK: store i8 %[[VAL_29]], i8* %[[VAL_28]], align 1 +// CHECK: %[[VAL_21]] = add nuw nsw i64 %[[VAL_20]], 1 +// CHECK: %[[VAL_30:.*]] = icmp eq i64 %[[VAL_21]], %[[VAL_4]] +// CHECK: br i1 %[[VAL_30]], label %[[VAL_10]], label %[[VAL_19]] +// CHECK: ._crit_edge5: ; preds = %[[VAL_19]], %[[VAL_7]], %[[VAL_17]] +// CHECK: %[[VAL_31:.*]] = phi i8* [ %[[VAL_9]], %[[VAL_7]] ], [ %[[VAL_18]], %[[VAL_17]] ], [ %[[VAL_18]], %[[VAL_19]] ] +// CHECK: %[[VAL_32:.*]] = call i8* @__nvqpp_vectorCopyCtor(i8* nonnull %[[VAL_31]], i64 %[[VAL_4]], i64 1) +// CHECK: %[[VAL_33:.*]] = bitcast i8* %[[VAL_32]] to i1* +// CHECK: %[[VAL_34:.*]] = insertvalue { i1*, i64 } undef, i1* %[[VAL_33]], 0 +// CHECK: %[[VAL_35:.*]] = insertvalue { i1*, i64 } %[[VAL_34]], i64 %[[VAL_4]], 1 // CHECK: call void @__quantum__rt__qubit_release_array(%[[VAL_3]]* %[[VAL_2]]) -// CHECK: ret { i1*, i64 } %[[VAL_46]] +// CHECK: ret { i1*, i64 } %[[VAL_35]] // CHECK: } // CHECK-LABEL: define void @test_0({ i8*, i8*, i8* }* sret({ i8*, i8*, i8* }) @@ -150,7 +149,7 @@ func.func @test_0(%1: !cc.ptr, !cc.ptr, !cc.ptr} // CHECK: } // struct{bool, bool} -> i16 -func.func @__nvqpp__mlirgen__test_1() -> !cc.struct<{i1, i1}> attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_1() -> !cc.struct<{i1, i1}> { %qubits = quake.alloca !quake.veq<2> %q0 = quake.extract_ref %qubits[0] : (!quake.veq<2>) -> !quake.ref %q1 = quake.extract_ref %qubits[1] : (!quake.veq<2>) -> !quake.ref @@ -212,7 +211,7 @@ func.func @test_1(%this: !cc.ptr) -> i16 { // CHECK: } // struct{i16, f32, f64, i64} -> sret ptr -func.func @__nvqpp__mlirgen__test_2() -> 
!cc.struct<{i16, f32, f64, i64}> attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_2() -> !cc.struct<{i16, f32, f64, i64}> { %rv = cc.undef !cc.struct<{i16, f32, f64, i64}> %c1 = arith.constant 8 : i16 %rv1 = cc.insert_value %rv[0], %c1 : (!cc.struct<{i16, f32, f64, i64}>, i16) -> !cc.struct<{i16, f32, f64, i64}> @@ -256,7 +255,7 @@ func.func @test_2(%1: !cc.ptr> {llvm.sret = !cc // array -> sret ptr -func.func @__nvqpp__mlirgen__test_3() -> !cc.array attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_3() -> !cc.array { %rv = cc.undef !cc.array %c1 = arith.constant 5 : i64 %rv1 = cc.insert_value %rv[0], %c1 : (!cc.array, i64) -> !cc.array @@ -301,7 +300,7 @@ func.func @test_3(%1: !cc.ptr> {llvm.sret = !cc.array { i64, f64 } -func.func @__nvqpp__mlirgen__test_4() -> (i64, f64) attributes {"cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_4() -> (i64, f64) { %c1 = arith.constant 537892 : i64 %c2 = arith.constant 94.2134 : f64 return %c1, %c2 : i64, f64 @@ -336,7 +335,7 @@ func.func @test_4(%sret: !cc.ptr> {llvm.sret = !cc.struct // CHECK: ret void // CHECK: } -func.func @__nvqpp__mlirgen__test_5() -> (i64, f64) attributes {no_this, "cudaq-entrypoint"} { +func.func @__nvqpp__mlirgen__test_5() -> (i64, f64) attributes {no_this} { %c1 = arith.constant 537892 : i64 %c2 = arith.constant 94.2134 : f64 return %c1, %c2 : i64, f64 diff --git a/unittests/common/MeasureCountsTester.cpp b/unittests/common/MeasureCountsTester.cpp index 0969ea43822..92ba31429ae 100644 --- a/unittests/common/MeasureCountsTester.cpp +++ b/unittests/common/MeasureCountsTester.cpp @@ -64,42 +64,3 @@ CUDAQ_TEST(MeasureCountsTester, checkMeasureCountsSerialize) { EXPECT_TRUE(mm == mc); } - -CUDAQ_TEST(MeasureResultTester, checkConstructors) { - static_assert(!std::is_default_constructible_v); - static_assert(std::is_copy_constructible_v); - static_assert(std::is_move_constructible_v); - static_assert(!std::is_copy_assignable_v); - 
static_assert(!std::is_move_assignable_v); - - cudaq::measure_result r1(int64_t(1)); - EXPECT_EQ(static_cast(r1), 1); - EXPECT_TRUE(static_cast(r1)); - - cudaq::measure_result r2(int64_t(0), int64_t(42)); - EXPECT_EQ(static_cast(r2), 0); - EXPECT_FALSE(static_cast(r2)); - EXPECT_NEAR(static_cast(r2), 0.0, 1e-9); - - cudaq::measure_result r3(r1); - EXPECT_EQ(static_cast(r3), 1); - - cudaq::measure_result r4(std::move(r1)); - EXPECT_EQ(static_cast(r4), 1); -} - -CUDAQ_TEST(MeasureResultTester, checkComparisons) { - cudaq::measure_result a(int64_t(1), int64_t(10)); - cudaq::measure_result b(int64_t(1), int64_t(10)); - cudaq::measure_result c(int64_t(0), int64_t(10)); - cudaq::measure_result d(int64_t(1), int64_t(20)); - - EXPECT_TRUE(a == b); - EXPECT_TRUE(a != c); - EXPECT_TRUE(a != d); - EXPECT_TRUE(a == true); - EXPECT_TRUE(true == a); - EXPECT_TRUE(c == false); - EXPECT_TRUE(c != true); - EXPECT_TRUE(false != a); -} diff --git a/unittests/qir/NVQIRTester.cpp b/unittests/qir/NVQIRTester.cpp index 2536bbcdf91..3ebaefb7733 100644 --- a/unittests/qir/NVQIRTester.cpp +++ b/unittests/qir/NVQIRTester.cpp @@ -105,7 +105,6 @@ Array *__quantum__rt__array_slice(Array *array, int32_t dim, int64_t range_end); Array *__quantum__rt__array_slice_1d(Array *array, int64_t range_start, int64_t range_step, int64_t range_end); -Array *__quantum__rt__result_array_create_1d(int64_t count); } CUDAQ_TEST(NVQIRTester, checkSimple) { @@ -126,18 +125,6 @@ CUDAQ_TEST(NVQIRTester, checkSimple) { __quantum__rt__finalize(); } -CUDAQ_TEST(NVQIRTester, checkResultArrayCreate) { - __quantum__rt__initialize(0, nullptr); - - constexpr int64_t numResults = 3; - auto *arr = __quantum__rt__result_array_create_1d(numResults); - EXPECT_NE(arr, nullptr); - EXPECT_EQ(__quantum__rt__array_get_size_1d(arr), numResults); - - __quantum__rt__array_release(arr); - __quantum__rt__finalize(); -} - // Stim does not support many of the gates used in these tests. 
#ifndef CUDAQ_BACKEND_STIM From 863db9041257ca82a17675a2f45fc82e172af2fb Mon Sep 17 00:00:00 2001 From: Sachin Pisal Date: Sun, 19 Apr 2026 21:14:25 -0700 Subject: [PATCH 21/85] Fixing Krylov notebook failure (#4348) Skipping identity terms when building the Pauli word and coefficient lists passed to the Krylov kernel. Controlled exp_pauli does not handle the identity terms. We add their contribution back when assembling the Hamiltonian matrix. Fixes https://github.com/NVIDIA/cuda-quantum/actions/runs/24584888146/job/71904057326#step:5:1955 Signed-off-by: Sachin Pisal --- docs/sphinx/applications/python/krylov.ipynb | 34 +++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/docs/sphinx/applications/python/krylov.ipynb b/docs/sphinx/applications/python/krylov.ipynb index 711eb9d2c28..66b95331e95 100644 --- a/docs/sphinx/applications/python/krylov.ipynb +++ b/docs/sphinx/applications/python/krylov.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "ba61665c-dc3b-4e43-b1cf-340855ea68fb", "metadata": {}, "outputs": [], @@ -100,7 +100,7 @@ "[pyscf] Total number of orbitals = 2\n", "[pyscf] Total number of electrons = 2\n", "[pyscf] HF energy = -1.116325564486115\n", - "[pyscf] Total R-CCSD energy = -1.1371758844013342\n", + "[pyscf] Total R-CCSD energy = -1.1371758844013327\n", "Ground state energy (classical simulation)= (-1.1371757102406845+0j) , index= 3\n" ] } @@ -167,17 +167,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "[(-0.10647701149300526+0j), (0.17028010135220517+0j), (0.17028010135220514+0j), (-0.22004130022421745+0j), (-0.22004130022421745+0j), (0.1683359862516207+0j), (0.12020049071260122+0j), (0.1656068235817425+0j), (0.1656068235817425+0j), (0.12020049071260122+0j), (0.17407289249680213+0j), (-0.04540633286914128+0j), (0.04540633286914128+0j), (0.04540633286914128+0j), (-0.04540633286914128+0j)]\n", - "['IIII', 'ZIII', 'IZII', 'IIZI', 'IIIZ', 'ZZII', 
'ZIZI', 'ZIIZ', 'IZZI', 'IZIZ', 'IIZZ', 'XXYY', 'XYYX', 'YXXY', 'YYXX']\n" + "[(0.17028010135220506+0j), (0.17028010135220503+0j), (-0.2200413002242175+0j), (-0.2200413002242175+0j), (0.1683359862516207+0j), (0.12020049071260122+0j), (0.1656068235817425+0j), (0.1656068235817425+0j), (0.12020049071260122+0j), (0.17407289249680213+0j), (-0.04540633286914128+0j), (0.04540633286914128+0j), (0.04540633286914128+0j), (-0.04540633286914128+0j)]\n", + "['ZIII', 'IZII', 'IIZI', 'IIIZ', 'ZZII', 'ZIZI', 'ZIIZ', 'IZZI', 'IZIZ', 'IIZZ', 'XXYY', 'XYYX', 'YXXY', 'YYXX']\n" ] } ], "source": [ - "\n", - "# Collect coefficients from a spin operator so we can pass them to a kernel\n", + "# Collect coefficients from a spin operator so we can pass them to a kernel.\n", + "# The identity term is excluded. Its contribution is added back to the \n", + "# Hamiltonian matrix classically below.\n", "def term_coefficients(ham: cudaq.SpinOperator) -> list[complex]:\n", " result = []\n", " for term in ham:\n", + " if term.is_identity():\n", + " continue\n", " result.append(term.evaluate_coefficient())\n", " return result\n", "\n", @@ -185,9 +188,10 @@ "def term_words(ham: cudaq.SpinOperator) -> list[str]:\n", " # Our kernel uses these words to apply exp_pauli to the entire state.\n", " # we hence ensure that each pauli word covers the entire space.\n", - " \n", " result = []\n", " for term in ham:\n", + " if term.is_identity():\n", + " continue\n", " result.append(term.get_pauli_word(qubits_num))\n", " return result\n", "\n", @@ -195,6 +199,13 @@ "coefficient = term_coefficients(hamiltonian)\n", "pauli_string = term_words(hamiltonian)\n", "\n", + "# Sum of identity-term coefficients\n", + "# The identity contributes `identity_coef * S` to the Hamiltonian matrix.\n", + "identity_coef = sum(\n", + " term.evaluate_coefficient().real\n", + " for term in hamiltonian\n", + " if term.is_identity())\n", + "\n", "print(coefficient)\n", "print(pauli_string)" ] @@ -365,7 +376,6 @@ "metadata": {}, 
"outputs": [], "source": [ - "\n", "# Create the identity operator\n", "identity_op = cudaq.SpinOperator.from_word('I' * qubits_num)\n", "# Get the Pauli word and convert it to a list of integers\n", @@ -423,7 +433,7 @@ " # 2 entry array that stores real and imaginary part of matrix element\n", " tot_e = np.zeros(2)\n", "\n", - " # Loops over the terms in the Hamiltonian, computing expectation values\n", + " # Loops over the (non-identity) terms in the Hamiltonian, computing expectation values\n", " for coef, word in zip(coefficient, pauli_string):\n", " pauli_list = pauli_str(word, qubits_num)\n", " \n", @@ -441,8 +451,8 @@ " tot_e[0] += temp[0]\n", " tot_e[1] += temp[1]\n", "\n", - " # Sums real and imaginary totals to specify Hamiltonian entry\n", - " ham_matrx[m, n] = tot_e[0] + tot_e[1] * 1j\n", + " # Adds back the identity-term contribution.\n", + " ham_matrx[m, n] = tot_e[0] + tot_e[1] * 1j + identity_coef * wf_overlap[m, n]\n", " if n != m:\n", " ham_matrx[n, m] = np.conj(ham_matrx[m, n])" ] @@ -512,7 +522,7 @@ "output_type": "stream", "text": [ "Energy from QFD:\n", - "(-1.137176660753775-1.6945689273261445e-07j)\n" + "(-1.1359686811350462-4.497484607599205e-09j)\n" ] } ], From c9d0f3bcbea16715401996cc612d2ef76835a918 Mon Sep 17 00:00:00 2001 From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:53:17 +1000 Subject: [PATCH 22/85] [Python] Fix overload resolution for `state.from_data` with MPS tensors (#4351) Fixed the `test_state_mps.py - AttributeError: 'list' object has no attribute 'dtype'` errors in https://github.com/NVIDIA/cuda-quantum/actions/runs/24624569814/job/72005503960#step:7:43857 The fix for the rest of the failure (`RuntimeError: invalid value`) will come in a separate PR. 
Signed-off-by: Thien Nguyen --- python/runtime/cudaq/algorithms/py_state.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp index a1ff9c2cd02..d1099e692be 100644 --- a/python/runtime/cudaq/algorithms/py_state.cpp +++ b/python/runtime/cudaq/algorithms/py_state.cpp @@ -523,12 +523,24 @@ void cudaq::bindPyState(nanobind::module_ &mod, LinkedLibraryHolder &holder) { .def_static( "from_data", [&](nanobind::object data) { + // Reject Python sequences (list/tuple) overload — they should be + // dispatched to the vector overload below. In pybind11, py::buffer + // excluded lists; nanobind::object accepts anything, so we must + // guard explicitly. + if (nanobind::isinstance(data) || + nanobind::isinstance(data)) + throw nanobind::next_overload(); return createStateFromPyBuffer(data, holder); }, "Return a state from data.") .def_static( "from_data", [&holder](const std::vector &tensors) { + // Reject SimulationState::Tensor objects overload — they're handled + // by the next overload and don't have numpy/cupy buffer attributes. + if (!tensors.empty() && + nanobind::isinstance(tensors[0])) + throw nanobind::next_overload(); const bool isHostData = tensors.empty() || !nanobind::hasattr(tensors[0], "__cuda_array_interface__"); From ca69d2d204032d726ea6aa8400eaec5788f096f7 Mon Sep 17 00:00:00 2001 From: "Adam T. Geller" Date: Mon, 20 Apr 2026 00:53:22 -0700 Subject: [PATCH 23/85] Directly invoke locally simulated python kernels with args (#4265) This PR removes argument synthesis by default for Python kernels run on the local simulator, instead directly invoking them with the arguments (currently, by constructing a message buffer through `.argsCreator` which is passed to the kernel's `thunk`). This only affects entry point kernels. Benefits: 1. 
This makes it unnecessary to recompile kernels for different arguments in this setting, simplifying the `reuse_compiler_artifacts` logic. 2. It aligns the python local simulation path more closely with C++, where arguments are similarly not synthesized. 3. As a result of 1 and 2, it is a useful and important first step towards an inter-launch caching strategy for python. --------- Signed-off-by: Adam Geller Signed-off-by: Luca Mondada Co-authored-by: Luca Mondada --- .../cudaq/platform/py_alt_launch_kernel.cpp | 109 ++++++++++---- .../cudaq/platform/py_alt_launch_kernel.h | 3 + .../test_cpp_quantum_algorithm_module.cpp | 3 +- python/tests/kernel/test_reuse_compiler.py | 86 ++++++++++- python/utils/OpaqueArguments.h | 24 ++++ runtime/common/CompiledModule.cpp | 25 +++- runtime/common/CompiledModule.h | 8 +- runtime/common/ExecutionContext.cpp | 77 ++++------ runtime/common/ExecutionContext.h | 19 ++- runtime/cudaq/platform/default/python/QPU.cpp | 135 ++++++++---------- .../compiler/CompiledModuleHelper.cpp | 14 +- 11 files changed, 318 insertions(+), 185 deletions(-) diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index dd5b60c6823..50bc7578d79 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -54,6 +54,7 @@ using namespace mlir; using namespace cudaq_internal::compiler; using cudaq::JitEngine; +using cudaq::PackingStyle; static std::function getTransportLayer = []() -> std::string { throw std::runtime_error("binding for kernel launch is incomplete"); @@ -178,6 +179,7 @@ nanobind::args cudaq::simplifiedValidateInputArguments(nanobind::args &args) { return processed; } +template void cudaq::handleStructMemberVariable(void *data, std::size_t offset, mlir::Type memberType, nanobind::object value) { @@ -206,13 +208,19 @@ void cudaq::handleStructMemberVariable(void *data, std::size_t offset, for (std::size_t i = 
0; auto v : asList) (*values)[i++] = nanobind::cast(v); - std::memcpy(((char *)data) + offset, values, 16); + // synthesis path: span {ptr, size_t} + // argsCreator path: std::vector {ptr, ptr, ptr} + constexpr std::size_t copySize = + sizeof(std::conditional_t